加入收藏 | 设为首页 | 会员中心 | 我要投稿 厦门站长网 (https://www.0592zz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文


发布时间:2021-03-17 08:47:24 所属栏目:大数据 来源:网络整理
导读:余弦相似性算法的具体介绍参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html 。下面是我根据上边的介绍进行的 Java 语言的实现(代码节选,完整代码见下文): import java.io.IOException;import java.io.StringReader;import java.util.Collections;import java.util.C……



import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.wjb.util.common.WjbTuple2;

public class CosineTextSimilarity {

    public static Map<String,Integer> makeTermFrequency(String text) throws IOException
        Analyzer analyzer = new IKAnalyzer(true);
        StringReader reader = new StringReader(text);
        TokenStream ts = analyzer.tokenStream("",reader);  
        CharTermAttribute term=ts.getAttribute(CharTermAttribute.class); 
        Map<String,Integer> tf = new HashMap<String,Integer>();
            String t = term.toString();
            Integer count = tf.get(t);
            if(count == null)
                tf.put(t,count + 1);
        return tf;

    /** * 根据key的长度进行过滤,只有key的长度不小于 length 时,这个key才会保留 * @param map * @param length * @return * @throws IOException */
    public static Map<String,Integer> filterByKeyLength(Map<String,Integer> map,int length) throws IOException
        Map<String,Integer> m = new HashMap<String,Integer>();
        for(String key : map.keySet())
             if(key == null || key.trim().length() >= length)
        return m;

    public static WjbTuple2<int[],int[]> makeVector(Map<String,Integer> first,Map<String,Integer> second){
         Set<String> keys = new HashSet<String>();
         int[] vector1 = new int[keys.size()];
         int[] vector2 = new int[keys.size()];
         int i = 0;
         for(String key : keys)
             Integer count1 = first.get(key);
             if(count1 != null)
                 vector1[i] = count1;
             Integer count2 = second.get(key);
             if(count2 != null)
                 vector2[i] = count2;

        return new WjbTuple2<int[],int[]>(vector1,vector2);

    public static double cosine(WjbTuple2<int[],int[]> tuple)
        int[] vector1 = tuple._1;
        int[] vector2 = tuple._2;

        double sum1 = 0;
        double sum21 = 0;
        double sum22 = 0;

        for (int i = 0; i < vector1.length; i++) {
            sum1 += vector1[i] * vector2[i];
            sum21 += vector1[i] * vector1[i];
            sum22 += vector2[i] * vector2[i];

        return sum1/(Math.sqrt(sum21 * sum22 ));

    public static List<Entry> sort(Map unsortMap) {

        // Convert Map to List
        List<Map.Entry> list = new LinkedList<Map.Entry>(unsortMap.entrySet());

        // Sort list with comparator,to compare the Map values
        Collections.sort(list,new Comparator<Map.Entry>() {
            public int compare(Map.Entry o1,Map.Entry o2) {
                String d1 = o1.getValue().toString();
                String d2 = o2.getValue().toString();
                String k1 = o1.getKey().toString();
                String k2 = o2.getKey().toString();
                if(o1.getValue() instanceof Integer)
                    Integer nd1 = Integer.parseInt(d1);
                    Integer nd2 = Integer.parseInt(d2);
                    if( nd2 - nd1 != 0 )
                        return nd2 - nd1;
                        return k2.compareTo(k1);
                    return d2.compareTo(d1);

        return list;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import com.wjb.util.common.WjbFileUtil;
import com.wjb.util.common.WjbTuple2;

public class Main {
    public static void main(String[] args) throws Exception {

        String text1 = WjbFileUtil.fromFile("d:/1.txt");
        String text2 = WjbFileUtil.fromFile("d:/2.txt",WjbFileUtil.GBK);

        long begin = System.currentTimeMillis();
        Map<String,Integer> map1 = CosineTextSimilarity.makeTermFrequency(text1);
        Map<String,Integer> map2 = CosineTextSimilarity.makeTermFrequency(text2);

//      map1 = CosineTextSimilarity.filterByKeyLength(map1,2);
//      map2 = CosineTextSimilarity.filterByKeyLength(map2,2);

        List<Entry> list1 = CosineTextSimilarity.sort(map1);
        list1 = list1.subList(0,list1.size() > 20 ? 20 : list1.size());

        List<Entry> list2 = CosineTextSimilarity.sort(map2);
        list2 = list2.subList(0,list2.size() > 20 ? 20 : list2.size());

        map1 = list2Map(list1);
        map2 = list2Map(list2);

        WjbTuple2<int[],int[]> tuple = CosineTextSimilarity.makeVector(map1,map2);
        double cos = CosineTextSimilarity.cosine(tuple);

        long end = System.currentTimeMillis();

        System.out.println(end - begin);


    public static Map<String,Integer> list2Map(List<Entry> list)
        Map<String,Integer> map = new HashMap<String,Integer>();
        for(Entry e : list)
        return map;

