目的:解析XML文件并存入MySQL,同时保证解析出的字段能够一一对应。这里解析的是微博语料文件,需要用到其中的article和person_id两个字段。
思路:
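为了让person_id和article能够一一对应,对这两个字段分别解析,并定义一个私有变量ct,在重写(override)的startElement函数中每遇到一个元素就自动加1。ct作为插入MySQL时article和person_id的主键,即(ct,article)和(ct,person_id)。注意ct对每个元素都会加1,因此在每条记录结构相同的前提下,同一条微博的article和person_id得到的ct相差一个固定的偏移量,生成person_id的主键时需要减去这个偏移量。把两类记录分别插入两张不同的表a和b之后,再对两张表做连接操作,就实现了article和person_id的一一对应(曲线救国啊!!!)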
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

/**
 * SAX handler that turns the text of every {@code <article>} element into a
 * MySQL {@code insert} statement and appends it to a script file.
 *
 * <p>Each statement has the form
 * {@code insert into tb_xml_article_hd1 values (<id>,"<text>");} followed by a
 * {@code commit;} line. The id is the running count of start tags seen by this
 * handler at the moment the {@code <article>} element was opened, so ids are
 * unique but not consecutive.
 *
 * <p>Two files are touched inside the output directory: {@code addfile.txt}
 * receives the article statements, and {@code add_id.txt} is created empty if
 * it does not exist (this class writes nothing into it).
 *
 * <p>The text of one element is buffered across all {@code characters}
 * callbacks and written once when the element ends, because a SAX parser is
 * allowed to deliver a single text node in several chunks.
 */
public class sax_parse_xml extends DefaultHandler {

    /** Input used by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_INPUT =
            "/home/hadoop/weibo_content_corpus/nlpir_weibo_content";

    /** Output directory used by the no-argument constructor. */
    private static final String DEFAULT_OUTPUT_DIR = "/home/hadoop/weibo_content_corpus";

    /** Names of the currently open elements, innermost on top. */
    java.util.Stack<String> tags = new java.util.Stack<String>();

    /** Number of start tags seen so far; used as the row key. */
    private long ct = 0;

    /** Script file that receives the article insert statements. */
    private final File articleScript;

    /** Companion file that is only created, never written, by this class. */
    private final File personScript;

    /** Text collected for the {@code <article>} element currently being read. */
    private final StringBuilder articleText = new StringBuilder();

    /** Value of {@link #ct} when the current {@code <article>} element was opened. */
    private long articleId;

    /** Set once the existence check for the output files has been performed. */
    private boolean outputFilesPrepared;

    /**
     * Reports whether a code point lies in one of the ranges accepted here:
     * tab, line feed, carriage return, {@code 0x20-0xD7FF},
     * {@code 0xE000-0xFFFD} or {@code 0x10000-0x10FFFF}.
     *
     * @param ch the code point to test
     * @return {@code true} if {@code ch} is inside one of those ranges
     */
    public static boolean isLegalXMLCharacter(int ch) {
        if (ch <= 0x0) {
            return false;
        }
        if (ch < 0x20) {
            return ch == '\n' || ch == '\r' || ch == '\t';
        }
        if (ch <= 0xD7FF) {
            return true;
        }
        return (ch >= 0xE000 && ch <= 0xFFFD) || (ch >= 0x10000 && ch <= 0x10FFFF);
    }

    /** Creates a handler that writes into the default output directory. */
    public sax_parse_xml() {
        this(new File(DEFAULT_OUTPUT_DIR));
    }

    /**
     * Creates a handler that writes its script files into the given directory.
     *
     * @param outputDir directory that holds {@code addfile.txt} and {@code add_id.txt}
     */
    public sax_parse_xml(File outputDir) {
        super();
        this.articleScript = new File(outputDir, "addfile.txt");
        this.personScript = new File(outputDir, "add_id.txt");
    }

    /**
     * Parses the corpus file and prints the elapsed time.
     *
     * @param args optional: {@code args[0]} is the XML input (passed to the
     *             parser as a system id), {@code args[1]} is the output
     *             directory; both fall back to the built-in defaults
     */
    public static void main(String args[]) {
        long lasting = System.currentTimeMillis();
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        File outputDir = new File(args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR);
        try {
            SAXParserFactory sf = SAXParserFactory.newInstance();
            SAXParser sp = sf.newSAXParser();
            sax_parse_xml reader = new sax_parse_xml(outputDir);
            sp.parse(new InputSource(input), reader);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Kept from the original program: prints the numeric code of the character.
        System.out.println((int) '运');
        System.out.println("运行时间:" + (System.currentTimeMillis() - lasting) + "毫秒");
    }

    /**
     * Buffers text that belongs directly to an {@code <article>} element.
     * Text of any other element is ignored.
     *
     * @param ch     character buffer supplied by the parser
     * @param start  offset of the first character to read
     * @param length number of characters to read
     * @throws SAXException never thrown here; declared by the overridden method
     */
    @Override
    public void characters(char ch[], int start, int length) throws SAXException {
        prepareOutputFiles();
        // Guard against a direct call without a preceding startElement.
        if (tags.isEmpty()) {
            return;
        }
        if ("article".equals(tags.peek())) {
            articleText.append(ch, start, length);
        }
    }

    /**
     * Records the opened element and advances the element counter. Opening an
     * {@code <article>} element also starts a fresh text buffer and fixes the
     * id that its statement will use.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) {
        tags.push(qName);
        ct = ct + 1;
        if ("article".equals(qName)) {
            articleText.setLength(0);
            articleId = ct;
        }
    }

    /**
     * Closes the innermost element; when it is an {@code <article>} element the
     * buffered text is written out as one insert statement.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (!tags.isEmpty()) {
            tags.pop();
        }
        if ("article".equals(qName)) {
            flushArticle();
        }
    }

    /** Writes the buffered article text as one statement, skipping blank text. */
    private void flushArticle() {
        String text = articleText.toString();
        articleText.setLength(0);
        System.out.println("article:");
        if (text.trim().isEmpty()) {
            return;
        }
        String statement = "insert into tb_xml_article_hd1 values ("
                + articleId + "," + "\"" + escapeSqlString(text) + "\"" + ");";
        appendStatement(articleScript, statement);
        System.out.println(statement);
    }

    /**
     * Backslash-escapes {@code \} and {@code "} so the text cannot terminate the
     * double-quoted literal it is embedded in. This assumes the script is run
     * with MySQL's default SQL mode, where backslash is the escape character.
     */
    private static String escapeSqlString(String text) {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '\\' || c == '"') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }

    /**
     * Appends the statement and a {@code commit;} line to the script as UTF-8.
     * A failure is reported on stderr and parsing continues.
     */
    private void appendStatement(File script, String statement) {
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(script, true), StandardCharsets.UTF_8)) {
            writer.write(statement + "\n" + "commit;" + "\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Creates both output files if they are missing; runs at most once per handler. */
    private void prepareOutputFiles() {
        if (outputFilesPrepared) {
            return;
        }
        outputFilesPrepared = true;
        createIfMissing(articleScript);
        createIfMissing(personScript);
    }

    /** Creates an empty file when none exists yet; a failure is reported on stderr. */
    private static void createIfMissing(File file) {
        if (file.exists()) {
            return;
        }
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}