#!/usr/local/bin/ruby -w
#
# Extract data from Discus Freeware message board
#
# This script scans all the flat files created by the Discus Freeware software
# and outputs them in CSV format.
#
# See the readme file for details.
# CGI provides unescapeHTML, which escape_for_csv relies on. The library file
# is named "cgi" in lower case; requiring it as 'CGI' only resolves on
# case-insensitive file systems and raises LoadError elsewhere.
require 'cgi'
# Configuration
#
# path_to_raw  - the flat files to scan: every entry matching the glob "raw/*"
# topic_file   - name of the CSV output file for topics
# message_file - name of the CSV output file for messages
# post_file    - name of the CSV output file for posts
path_to_raw  = Dir.glob("raw/*")
topic_file   = "topics.csv"
message_file = "messages.csv"
post_file    = "posts.csv"
# Prepare a text value for embedding between double quotes in a CSV row.
#
# text - the String to clean up, or nil.
#
# Steps, in order: HTML entities are decoded, every double quote is doubled,
# each line break (with the spaces around it) is removed, runs of two or more
# spaces collapse to a single space, and surrounding whitespace is stripped.
#
# Returns the cleaned String. nil input yields "" so that callers which
# concatenate the result into a CSV row with + do not raise TypeError.
def escape_for_csv(text)
  return "" if text.nil?

  cleaned = CGI.unescapeHTML(text)
  cleaned = cleaned.gsub('"', '""')
  # A line break and the spaces touching it become a blank line here; the
  # line breaks themselves are dropped two steps further down, so the net
  # effect is that the break and its surrounding spaces disappear.
  # NOTE(review): inserting "\n\n" only to delete it again looks like the
  # pattern or the final replacement once contained markup -- confirm intent.
  cleaned = cleaned.gsub(/[ ]*\n[ ]*/, "\n\n")
  cleaned = cleaned.gsub(/[ ]{2,}/, " ")
  cleaned = cleaned.gsub(/\n/, "")
  cleaned.strip
end
# Render a Unix timestamp as "YYYY-MM-DD HH:MM:SS".
#
# seconds - seconds since the epoch; anything responding to to_i
#           (Integer, numeric String, nil).
#
# Returns the formatted String, expressed in the time zone Time.at uses
# (the process's local zone).
def date_for_csv(seconds)
  epoch = seconds.to_i
  moment = Time.at(epoch)
  moment.strftime("%Y-%m-%d %H:%M:%S")
end
# Turn raw message text into basic paragraph-structured output.
#
# raw - the String to convert, or nil.
#
# The text is passed through sanitize_to_text (defined outside the part of
# the file visible here), every run of CR/LF characters is collapsed to a
# single "\n", and the result is handed to nl2p.
#
# Returns "" for nil or empty input, otherwise whatever nl2p returns.
def sanitize_to_basic_html(raw)
  return "" if raw.nil? || raw.empty?

  text = sanitize_to_text(raw)
  # The previous pattern /[\n|\r\n|\r]+/ was a character class, so it also
  # matched literal "|" characters and turned them into line breaks. Only
  # carriage returns and line feeds are meant to be normalised.
  nl2p(text.gsub(/[\r\n]+/, "\n"))
end
# Rewrite +raw+ line by line: strip one leading and one trailing line-break
# character per line, then replace each line with the replacement literal
# given in the final gsub! (a line break, the captured line, a line break).
#
# raw - the String to convert, or nil.
#
# Returns "" when raw is nil or empty; otherwise the value of the final
# gsub! call, which is the string that was passed in, modified in place.
#
# NOTE(review): html is only another name for raw, so every gsub! below
#   mutates the caller's string (and would raise on a frozen one).
# NOTE(review): [\n|\r\n|\r] is a character class, not an alternation; it
#   matches a single "\n", "\r" or literal "|" character.
# NOTE(review): ^ and $ anchor at every line, not only at the start and end
#   of the whole string, so the two "trim" substitutions apply per line.
# NOTE(review): given the method name, the replacement literal presumably
#   once wrapped \1 in paragraph markup that is no longer present, and the
#   closing `end` now shares a physical line with the script that follows --
#   confirm against a pristine copy of this file.
def nl2p(raw)
# convert new lines to paragraphs
return "" if raw.nil? or raw.empty?
# trim line breaks from start and end of string
# (operates directly on the argument: html and raw are the same object)
html = raw
html.gsub!(/^[\n|\r\n|\r]/, '')
html.gsub!(/[\n|\r\n|\r]$/, '')
html.gsub!(/^(.*)$/, '
\1
') end ftopic = File.open(topic_file, "w") fmsg = File.open(message_file, "w") fpost = File.open(post_file, "w") ftopic.puts "id,title" fmsg.puts "id,topic_id,title" fpost.puts "id,message_id,created_at,created_by,content" topics = Hash.new() current_topic_id = 0 # iterate through the raw files path_to_raw.each do |pathin| fin = File.open(pathin, "r") message_id = fin.path[/[0-9]+/] current_message = nil current_post = Hash.new fin.each do |line| if line =~ /$/) unless topics.has_key?(current_topic[1]) topics[current_topic[1]] = current_topic[2] current_topic_id = current_topic[1] ftopic.puts '"' + current_topic[1] + '","' + escape_for_csv(current_topic[2]) + '"' end end if line =~ /^