中科院分词工具的使用
中科院分词工具(NLPIR)的 Java 配置方法与其自带文档中的说明相同,下面是示例代码:
package xieru;
import hello.Hello.CLibrary;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.regex.Pattern;
import com.csvreader.CsvReader;
import com.csvreader.CsvWriter;
import com.sun.jna.Library;
import com.sun.jna.Native;
/**
 * Helpers built around the NLPIR word-segmentation native library (accessed
 * through JNA) together with a small CSV row filter.
 *
 * <p>Input CSV files are decoded as GBK. Output files are written through
 * {@link FileWriter}, which uses the platform default charset.
 */
public class WriteSeparatewords {

    /** Charset used to decode the input CSV files. */
    private static final Charset INPUT_CHARSET = Charset.forName("GBK");

    /** Zero-based index of the column that {@link #it} filters on. */
    private static final int CATEGORY_COLUMN = 10;

    /** Value ("technology") the filter column must equal for a row to be kept. */
    private static final String CATEGORY_TECH = "技术";

    /** JNA mapping of the NLPIR native functions used by this class. */
    public interface CLibrary extends Library {
        /**
         * Shared handle to the native library, created when this interface is
         * initialized. The location defaults to the original hard-coded path and
         * can be overridden with the {@code nlpir.library} system property.
         */
        CLibrary Instance = (CLibrary) Native.loadLibrary(
                System.getProperty("nlpir.library", "E:\\workplace\\hello\\NLPIR"),
                CLibrary.class);

        /**
         * Initializes the native library.
         *
         * @return {@code false} when initialization failed
         */
        boolean NLPIR_Init(byte[] sDataPath, int encoding, byte[] sLicenceCode);

        /** Segments {@code sSrc} into words and returns the segmented text. */
        String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

        /** Extracts up to {@code nMaxKeyLimit} keywords from {@code sLine}. */
        String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

        /** Releases the native library; call once after processing is finished. */
        void NLPIR_Exit();
    }

    /**
     * Encodes {@code aidString} to bytes using {@code ori_encoding} and decodes
     * those bytes again using {@code new_encoding}.
     *
     * @param aidString    text to convert; must not be {@code null}
     * @param ori_encoding charset name used to encode the text to bytes
     * @param new_encoding charset name used to decode the bytes
     * @return the re-decoded text, or {@code null} when either charset name is
     *         not supported (the stack trace is printed to stderr)
     */
    public static String transString(String aidString, String ori_encoding,
            String new_encoding) {
        try {
            return new String(aidString.getBytes(ori_encoding), new_encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Wraps a value in double quotes, doubling any embedded quote character so
     * the resulting CSV field stays well-formed. {@code null} becomes an empty
     * quoted field.
     */
    private static String quote(String value) {
        String text = (value == null) ? "" : value;
        return "\"" + text.replace("\"", "\"\"") + "\"";
    }

    /**
     * Copies every record of {@code inFile} whose column 10 equals "技术" to
     * {@code outFile}, with every field quoted. The header row of the input is
     * consumed and not written to the output. Records that have no column 10
     * are skipped.
     *
     * @param inFile  path of the GBK-encoded CSV file to read
     * @param outFile path of the CSV file to create or overwrite
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void it(String inFile, String outFile) throws IOException {
        try (FileInputStream in = new FileInputStream(inFile);
             BufferedWriter out = new BufferedWriter(new FileWriter(outFile))) {
            CsvReader reader = new CsvReader(in, ',', INPUT_CHARSET);
            try {
                reader.readHeaders();
                while (reader.readRecord()) {
                    String[] fields = reader.getValues();
                    // A short record cannot match the filter; skipping it avoids
                    // an ArrayIndexOutOfBoundsException on malformed rows.
                    if (fields.length <= CATEGORY_COLUMN
                            || !CATEGORY_TECH.equals(fields[CATEGORY_COLUMN])) {
                        continue;
                    }
                    StringBuilder line = new StringBuilder();
                    for (int i = 0; i < fields.length; i++) {
                        if (i > 0) {
                            line.append(',');
                        }
                        line.append(quote(fields[i]));
                    }
                    out.write(line.toString());
                    out.newLine();
                }
            } finally {
                reader.close();
            }
        }
    }

    /**
     * Runs word segmentation and keyword extraction on column 1 of every record
     * of {@code inFile}. Each output line has the form
     * {@code <column 0>,"<segmented text>",<keywords>}. Results are also echoed
     * to stdout. The header row of the input is consumed and not written.
     * Records with fewer than two columns are skipped.
     *
     * @param inFile  path of the GBK-encoded CSV file to read
     * @param outFile path of the CSV file to create or overwrite
     * @throws IOException if the native library fails to initialize, the input
     *                     cannot be read, or the output cannot be written
     */
    public static void fenci(String inFile, String outFile) throws IOException {
        String dataPath = "";
        String charsetName = "UTF-8";
        int charsetType = 1;

        if (!CLibrary.Instance.NLPIR_Init(dataPath.getBytes(charsetName),
                charsetType, "0".getBytes(charsetName))) {
            System.err.println("初始化失败!");
            // Calling into the library after a failed init is not meaningful,
            // so stop here instead of continuing as the original code did.
            throw new IOException("NLPIR initialization failed");
        }

        try (FileInputStream in = new FileInputStream(inFile);
             BufferedWriter out = new BufferedWriter(new FileWriter(outFile))) {
            CsvReader reader = new CsvReader(in, ',', INPUT_CHARSET);
            try {
                reader.readHeaders();
                while (reader.readRecord()) {
                    String[] fields = reader.getValues();
                    if (fields.length < 2) {
                        continue; // no text column to segment
                    }
                    String text = fields[1];

                    String segmented = CLibrary.Instance.NLPIR_ParagraphProcess(text, 3);
                    out.write(fields[0] + "," + quote(segmented) + ",");
                    System.out.println("分词结果为: " + segmented);

                    String keywords = CLibrary.Instance.NLPIR_GetKeyWords(text, 10, true);
                    // Writer.write(String) throws on null, so emit an empty field.
                    out.write(keywords == null ? "" : keywords);
                    System.out.println("关键词提取结果是:" + keywords);

                    out.newLine();
                    System.out.println("-----------------------------------");
                }
            } finally {
                reader.close();
            }
        } finally {
            // Release the native library even when reading or writing failed.
            CLibrary.Instance.NLPIR_Exit();
        }
    }

    /**
     * Runs the CSV filter step. With no arguments the original hard-coded paths
     * are used; otherwise {@code args[0]} is the input file and {@code args[1]}
     * the output file.
     *
     * <p>For the segmentation step, call
     * {@code fenci("F:/c/zhiweiyaoqiu.csv", "F:/c/fenci.csv")} instead.
     */
    public static void main(String[] args) throws Exception {
        String inFile = args.length > 0 ? args[0] : "F:/c/zhaopinxinxi.csv";
        String outFile = args.length > 1 ? args[1] : "F:/c/it.csv";
        WriteSeparatewords.it(inFile, outFile);
    }
}


