sen のアーカイブに入っていたサンプルコードをちょっとだけ手直し。
PreProcessor や PostProcessor を生成するためのクラスを作った。

# でも、残念ながら今の自分には不要なもの……


// Edited by NI-Lab. 2007-01-12
/*
 * StreamTaggerDemo2.java - StreamTaggerDemo2 is demonstration program for Sen.
 * 
 * Copyright (C) 2002 Takashi Okamoto, Tsuyoshi Fukui Takashi Okamoto
 * <tora@debian.org> Tsuyosh Fukui <fukui556@oki.com>
 * 
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *  
 */
 
import java.io.*;
import java.util.*;
 
import javax.xml.parsers.*;
 
import net.java.sen.*;
import net.java.sen.processor.*;
 
import org.w3c.dom.*;
import org.xml.sax.*;
 
/**
 * Factory that reads a Sen processor configuration file
 * ({@code sen-processor.xml}) once, at construction time, and afterwards
 * creates the pre-/post-processors described by that configuration.
 *
 * <p>The following children of the document's root element are recognised:
 * <ul>
 *   <li>{@code <composit>} - each value is appended, newline terminated, to
 *       the rule text handed to {@code CompositPostProcessor.readRules}</li>
 *   <li>{@code <remark>} - each value is appended, newline terminated, to
 *       the rule text handed to {@code RemarkPreProcessor.readRules}</li>
 *   <li>{@code <compound>} - when its value equals the configured marker
 *       string the compound file is opened and deserialized once as a
 *       sanity check</li>
 *   <li>{@code <dictionary>} - its nested {@code <compound>} element names
 *       the compound file, resolved through {@code SenUtils.getPath}</li>
 * </ul>
 * Any other element is ignored.
 */
public class ProcessorFactory {

  /**
   * Value of a top-level {@code <compound>} element that switches
   * {@link #isCompound} off (escaped form of the Japanese string "構成語",
   * roughly "constituent words").
   */
  private static final String COMPOUND_CONSTITUENT = "\u69cb\u6210\u8a9e";

  /** Intentionally empty; kept only so the class can be launched directly. */
  public static void main(String[] args) {
  }

  private final String senProcessorConfigFile;
  private String compositRule = "";
  private boolean isCompound = true;
  private String compoundFile = null;
  private String remarkRule = "";

  /**
   * Creates a factory and immediately parses the given configuration file.
   *
   * @param senProcessorConfigFile path to the sen-processor.xml file
   * @throws NullPointerException if {@code senProcessorConfigFile} is null
   * @throws IllegalArgumentException if the file cannot be read or parsed,
   *         if a required element is empty, or if the compound file check
   *         fails
   */
  public ProcessorFactory(String senProcessorConfigFile) {
    if (senProcessorConfigFile == null) {
      throw new NullPointerException("senProcessorConfigFile must not be null");
    }
    this.senProcessorConfigFile = senProcessorConfigFile;
    readConfig();
  }

  /**
   * Creates a compound-word post processor for the configured compound file.
   *
   * @return a new processor built from the compound file path
   * @throws IllegalStateException if the configuration did not name a
   *         compound file
   */
  public CompoundWordPostProcessor createCompoundWordPostProcessor() {
    // Mirrors the "not configured" handling of the other factory methods
    // instead of handing a null path to the processor.
    if (compoundFile == null) {
      throw new IllegalStateException(
          "no compound file is configured in " + senProcessorConfigFile);
    }
    return new CompoundWordPostProcessor(compoundFile);
  }

  /**
   * Creates a composit post processor loaded with the configured rules.
   *
   * @return a new processor that has read every {@code <composit>} rule
   * @throws IOException if reading the rule text fails
   * @throws IllegalStateException if no {@code <composit>} rule is configured
   */
  public CompositPostProcessor createCompositPostProcessor() throws IOException {
    if (compositRule == null || compositRule.equals("")) {
      throw new IllegalStateException(
          "no <composit> rule is configured in " + senProcessorConfigFile);
    }
    CompositPostProcessor processor = new CompositPostProcessor();
    processor.readRules(new BufferedReader(new StringReader(compositRule)));
    return processor;
  }

  /**
   * Creates a remark pre processor loaded with the configured rules.
   *
   * @return a new processor that has read every {@code <remark>} rule
   * @throws IOException if reading the rule text fails
   * @throws IllegalStateException if no {@code <remark>} rule is configured
   */
  public RemarkPreProcessor createRemarkPreProcessor() throws IOException {
    if (remarkRule == null || remarkRule.equals("")) {
      throw new IllegalStateException(
          "no <remark> rule is configured in " + senProcessorConfigFile);
    }
    RemarkPreProcessor processor = new RemarkPreProcessor();
    processor.readRules(new BufferedReader(new StringReader(remarkRule)));
    return processor;
  }

  /**
   * Creates a remark post processor. The processor itself takes no rules;
   * the rule text is only consulted to decide whether remark handling is
   * configured at all.
   *
   * @return a new remark post processor
   * @throws IllegalStateException if no {@code <remark>} rule is configured
   */
  public RemarkPostProcessor createRemarkPostProcessor() {
    if (remarkRule == null || remarkRule.equals("")) {
      throw new IllegalStateException(
          "no <remark> rule is configured in " + senProcessorConfigFile);
    }
    return new RemarkPostProcessor();
  }

  /**
   * Parses the configuration file and fills in the rule and file fields.
   *
   * @throws IllegalArgumentException if the file cannot be read or parsed,
   *         or if its content is invalid; the original exception is kept as
   *         the cause
   */
  private void readConfig() {
    // Base directory for relative dictionary paths: the directory above the
    // one holding the configuration file. A path without a directory part
    // has no parent at all, so guard against null instead of dereferencing.
    // NOTE(review): a null base is passed on to SenUtils.getPath unchanged -
    // confirm that helper accepts it.
    File configDir = new File(senProcessorConfigFile).getParentFile();
    String parent = (configDir == null) ? null : configDir.getParent();

    try {
      DocumentBuilder builder =
          DocumentBuilderFactory.newInstance().newDocumentBuilder();
      Document doc = builder.parse(new InputSource(senProcessorConfigFile));

      // getDocumentElement() rather than getFirstChild(): the first child of
      // a document may be a comment or a DOCTYPE, not the root element.
      NodeList nl = doc.getDocumentElement().getChildNodes();

      StringBuilder composit = new StringBuilder(compositRule);
      StringBuilder remark = new StringBuilder(remarkRule);

      for (int i = 0; i < nl.getLength(); i++) {
        org.w3c.dom.Node n = nl.item(i);
        if (n.getNodeType() != org.w3c.dom.Node.ELEMENT_NODE) {
          continue;
        }
        String nn = n.getNodeName();

        if (nn.equals("composit")) {
          composit.append(textOf(n)).append('\n');
        } else if (nn.equals("compound")) {
          if (COMPOUND_CONSTITUENT.equals(textOf(n))) {
            isCompound = false;
          }
        } else if (nn.equals("remark")) {
          remark.append(textOf(n)).append('\n');
        } else if (nn.equals("dictionary")) {
          readDictionary(n, parent);
        }
      }

      compositRule = composit.toString();
      remarkRule = remark.toString();

      if (!isCompound) {
        verifyCompoundFile();
      }
    } catch (ParserConfigurationException e) {
      throw new IllegalArgumentException(e.getMessage(), e);
    } catch (SAXException e) {
      throw new IllegalArgumentException(e.getMessage(), e);
    } catch (IOException e) {
      // Also covers FileNotFoundException.
      throw new IllegalArgumentException(e.getMessage(), e);
    }
  }

  /**
   * Reads the tags nested in a {@code <dictionary>} element; only
   * {@code <compound>} is used, but every nested element must be non-empty.
   *
   * @param dictionary the {@code <dictionary>} element
   * @param parent base directory passed to {@code SenUtils.getPath}
   */
  private void readDictionary(org.w3c.dom.Node dictionary, String parent) {
    NodeList dnl = dictionary.getChildNodes();
    for (int j = 0; j < dnl.getLength(); j++) {
      org.w3c.dom.Node dn = dnl.item(j);
      if (dn.getNodeType() != org.w3c.dom.Node.ELEMENT_NODE) {
        continue;
      }
      String dvalue = textOf(dn);
      if (dn.getNodeName().equals("compound")) {
        compoundFile = SenUtils.getPath(dvalue, parent);
      }
    }
  }

  /**
   * Returns the value of an element's first child node.
   *
   * @param element the element to read
   * @return the node value of the first child (null when that child is not a
   *         text-like node)
   * @throws IllegalArgumentException if the element has no children
   */
  private static String textOf(org.w3c.dom.Node element) {
    org.w3c.dom.Node first = element.getFirstChild();
    if (first == null) {
      throw new IllegalArgumentException(
          "element '" + element.getNodeName() + "' is empty");
    }
    return first.getNodeValue();
  }

  /**
   * Checks that the compound file exists and holds a serialized
   * {@link HashMap}. The deserialized map is discarded; the stream is always
   * closed.
   *
   * @throws IOException if the file cannot be opened or read
   * @throws IllegalArgumentException if no compound file is configured or
   *         the file holds something other than a {@code HashMap}
   */
  private void verifyCompoundFile() throws IOException {
    if (compoundFile == null) {
      throw new IllegalArgumentException(
          "compound processing is enabled but <dictionary> names no compound file");
    }
    // SECURITY NOTE: Java native deserialization must only be applied to
    // trusted files; the compound file is expected to be a locally built
    // dictionary artifact.
    InputStream raw = new FileInputStream(compoundFile);
    try {
      Object loaded = new ObjectInputStream(raw).readObject();
      if (loaded != null && !(loaded instanceof HashMap)) {
        throw new IllegalArgumentException(
            "compound file '" + compoundFile + "' does not contain a HashMap");
      }
    } catch (ClassNotFoundException e1) {
      throw new RuntimeException(e1);
    } finally {
      raw.close();
    }
  }

}

いつかまたどこかで使うかもしれないのでここに残しておく。

tags: zlashdot Java Java MeCabSen

Posted by NI-Lab. (@nilab)