#!/usr/local/bin/ruby -w
# frozen_string_literal: true

#
# Extract data from Discus Freeware message board
#
# This script scans all the flat files created by the Discus Freeware software
# and outputs them in CSV format.
#
# See the readme file for details.

# The feature name is lower-case; "CGI" fails to load on case-sensitive
# file systems.
require 'cgi'

# Config options
#
# * path_to_raw
#     sub directory/directories containing your flat files
# * topic_file
#     output file for topics
# * message_file
#     output file for messages
# * post_file
#     output file for posts
path_to_raw = Dir["raw/*"]
topic_file = 'topics.csv'
message_file = 'messages.csv'
post_file = 'posts.csv'

# NOTE(review): the statement that assigned the topic match is missing from
# the file as received (only a dangling `$/)` survives). The surviving code
# uses a match whose capture 1 is the topic id and capture 2 the topic title,
# anchored at end of line. This pattern is a reconstruction modelled on the
# other "<!--Name: value-->" markers the script reads; confirm it against
# real Discus files before relying on the topics output.
TOPIC_PATTERN = %r{<!--Topic: ([0-9]+)/(.+)-->$}

# Captures the message title that follows "Discussion Board: ".
TITLE_PATTERN = /Discussion Board: (.[^<]+)/

# Prepare a raw value for embedding inside a double-quoted CSV cell.
#
# Decodes HTML entities, doubles every double quote, collapses runs of
# spaces, removes line breaks together with the spaces around them, and
# strips leading/trailing whitespace.
#
# @param text [String, nil] raw value taken from a flat file
# @return [String, nil] cleaned value, or nil when +text+ is nil
def escape_for_csv(text)
  return if text.nil?

  text = CGI.unescapeHTML(text)
  text = text.gsub('"', '""')
  # NOTE(review): the case-insensitive flag on a pattern without letters, and
  # the empty replacement two lines below, suggest markup literals were lost
  # from this file. Both are kept exactly as received; confirm the intent.
  text = text.gsub(/[ ]*\n[ ]*/i, "\n\n")
  text = text.gsub(/[ ]{2,}/, ' ')
  text = text.gsub("\n", '')
  text.strip
end

# Format an epoch timestamp for the CSV output.
#
# @param seconds [#to_i] seconds since the Unix epoch (nil counts as 0)
# @return [String] "YYYY-MM-DD HH:MM:SS" as rendered by Time.at, i.e. in the
#   local time zone of the machine running the script
def date_for_csv(seconds)
  Time.at(seconds.to_i).strftime('%Y-%m-%d %H:%M:%S')
end

# Reduce +raw+ to text, squeeze each run of line breaks to a single newline
# and pass the result through nl2p.
#
# NOTE(review): sanitize_to_text is not defined anywhere in the visible
# source and nothing in this script calls this method; calling it with a
# non-empty string raises NoMethodError unless that helper is supplied.
#
# @param raw [String, nil]
# @return [String] "" when +raw+ is nil or empty
def sanitize_to_basic_html(raw)
  return '' if raw.nil? || raw.empty?

  html = sanitize_to_text(raw)
  # The class used to be [\n|\r\n|\r], which also matched a literal "|" and
  # therefore turned pipes into newlines.
  nl2p(html.gsub(/[\r\n]+/, "\n"))
end

# Convert new lines to paragraphs.
#
# Removes a line-break character sitting at the start or end of any line
# (which drops blank lines and a trailing newline) and then wraps every
# remaining line in the paragraph delimiter. The argument is not modified.
#
# NOTE(review): the delimiter in the file as received is two newlines on
# either side of the line; the method name suggests paragraph markup may
# have been lost. Kept as received; confirm the intent.
#
# @param raw [String, nil]
# @return [String] "" when +raw+ is nil or empty
def nl2p(raw)
  return '' if raw.nil? || raw.empty?

  # Non-destructive gsub: the caller's string stays untouched and frozen
  # input works. The old class [\n|\r\n|\r] also deleted "|" at line edges.
  trimmed = raw.gsub(/^[\r\n]/, '').gsub(/[\r\n]$/, '')
  trimmed.gsub(/^(.*)$/, "\n\n\\1\n\n")
end

# Scan the flat files and write the three CSV tables.
#
# A topic row is written the first time a topic id is seen. A message row is
# written per file that yields a title and is not reset by an empty parent
# marker. A post row is written at every closing post marker.
#
# The current topic id carries over between files: a file without a topic
# marker gets the topic of the previous file (0 before any topic was seen).
#
# @param paths [Array<String>] flat files to scan; non-files are skipped
# @param ftopic [#puts] receives the header and rows of the topics table
# @param fmsg [#puts] receives the header and rows of the messages table
# @param fpost [#puts] receives the header and rows of the posts table
# @return [void]
def export_discus(paths, ftopic, fmsg, fpost)
  ftopic.puts 'id,title'
  fmsg.puts 'id,topic_id,title'
  fpost.puts 'id,message_id,created_at,created_by,content'

  topics = {}
  current_topic_id = 0

  # iterate through the raw files
  paths.each do |pathin|
    next unless File.file?(pathin)

    # Take the id from the file name only, so digits in a directory name
    # cannot be mistaken for the message id.
    message_id = File.basename(pathin)[/[0-9]+/]
    current_message = nil
    current_post = {}

    File.foreach(pathin) do |line|
      if (topic = line.match(TOPIC_PATTERN))
        # Track the topic for every file, not only when it is new; otherwise
        # later files of an already-seen topic keep the id of whichever
        # topic was discovered last.
        current_topic_id = topic[1]
        unless topics.key?(topic[1])
          topics[topic[1]] = topic[2]
          ftopic.puts %("#{topic[1]}","#{escape_for_csv(topic[2])}")
        end
      end

      # NOTE(review): the guard in the file as received was the bare anchor
      # /^/, which matches every line and made the title lookup raise on
      # lines without a title. The title pattern itself is the guard now.
      title = line[TITLE_PATTERN, 1]
      if title
        current_message = {
          'id' => message_id,
          'topic_id' => current_topic_id,
          'title' => title
        }
      end

      # pages with no parents are topic indexes, not messages
      current_message = nil if line.include?('<!--Parent: -->')

      # a post opens with a marker such as <!--Post: 14-->
      post_id = line[/<!--Post: ([0-9]+)/, 1]
      current_post = { 'id' => post_id } if post_id

      # Each field is stored only when its capture succeeds, so a marker
      # with an unexpected payload no longer raises on nil.
      created_at = line[/<!--Time: ([0-9]+)/, 1]
      current_post['created_at'] = created_at if created_at

      created_by = line[/<!--name-->(.[^<]+)/, 1]
      current_post['created_by'] = created_by if created_by

      content = line[/<!--Text-->(.+)<!--\/Text-->/, 1]
      current_post['content'] = content if content

      next unless line.include?('<!--/Post: ')

      # Interpolation renders a missing field as an empty cell instead of
      # aborting the whole export.
      created = date_for_csv(current_post['created_at'])
      author = escape_for_csv(current_post['created_by'])
      body = nl2p(escape_for_csv(current_post['content']))
      fpost.puts %(#{current_post['id']},#{message_id},"#{created}","#{author}","#{body}")
    end

    next if current_message.nil?

    message_title = escape_for_csv(current_message['title'].to_s)
    fmsg.puts %(#{current_message['id']},#{current_message['topic_id']},"#{message_title}")
  end
end

# Block form closes every output file even when the export raises.
File.open(topic_file, 'w') do |ftopic|
  File.open(message_file, 'w') do |fmsg|
    File.open(post_file, 'w') do |fpost|
      export_discus(path_to_raw, ftopic, fmsg, fpost)
    end
  end
end

puts 'Done'