网页关键词频率计算(词频计算js版)

不需要词库,直接分割网页内容来提取词语,并统计每个词语的出现次数,按从多到少排序。该方法能够区分中英文词语,最初用于广告匹配的数据预处理,以 JavaScript 在访客的客户端(浏览器)中执行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
<script type="text/javascript">
//石卓林 2008-7-12 第二版.左右右左匹配版
/**
 * Dictionary-free keyword extraction: slices the page text into candidate
 * words, counts how often each one occurs and returns the repeated ones
 * ordered from most to least frequent.
 *
 * How it works:
 *   1. Whitespace in the body is collapsed to single spaces and trimmed.
 *   2. From every position, candidates of 2..15 characters are grown one
 *      character at a time. Growth stops at a delimiter (punctuation, space,
 *      or any character with code 123..254) or once the candidate holds more
 *      than 8 characters with code >= 255 (the cap meant for Chinese words).
 *   3. A candidate's weight is its number of non-overlapping occurrences in
 *      the body plus twice its number of occurrences in the title. Candidates
 *      weighing less than 2 are ignored.
 *   4. Left-to-right pass: when a candidate extends the previously accepted
 *      one, the longer word's weight is subtracted from the shorter prefix,
 *      and the prefix is dropped if fewer than 2 remain.
 *   5. Right-to-left pass: a word that is a suffix of the word recorded just
 *      before it has that word's weight subtracted; it is dropped if fewer
 *      than 2 remain. The first recorded word is always kept.
 *   6. The survivors are sorted by descending count (ties keep their order).
 *
 * Works both as `keywords(t, b)` and as `new keywords(t, b)`; in either case
 * the returned value is the result array.
 *
 * @param {string} ftitle Page title; null/undefined is treated as ''.
 * @param {string} ftbody Page text; null/undefined is treated as ''.
 * @returns {Array<{str: string, count: number}>} Keywords, most frequent first.
 */
function keywords(ftitle, ftbody) {
    var MAX_WORD_LENGTH = 15; // longest candidate (sized for English words)
    var MAX_WIDE_CHARS = 8;   // most characters with code >= 255 per candidate
    var TITLE_WEIGHT = 2;     // each title occurrence counts this many times
    var MIN_COUNT = 2;        // words seen only once are discarded
    // Extra delimiters on top of the character-code ranges checked below.
    var endchar = '。,:… (—)》《';

    var title = ftitle == null ? '' : String(ftitle);
    var tbody = (ftbody == null ? '' : String(ftbody))
        .replace(/\s+/g, ' ')
        .replace(/^\s+|\s+$/g, '');

    // Insertion-ordered word -> count store. A removed word leaves a null
    // slot, and re-adding it appends a new slot at the end.
    var slots = [];
    var slotOf = {}; // '$' + word -> index into slots ('$' avoids prototype names)

    // True when `word` currently has a live slot.
    function hasWord(word) {
        return Object.prototype.hasOwnProperty.call(slotOf, '$' + word);
    }

    // Sets the count of `word`, appending it when it is not stored yet.
    function setCount(word, count) {
        if (hasWord(word)) {
            slots[slotOf['$' + word]].count = count;
        } else {
            slotOf['$' + word] = slots.length;
            slots.push({ str: word, count: count });
        }
    }

    // Removes `word` from the store; does nothing when it is absent.
    function removeWord(word) {
        if (!hasWord(word)) {
            return;
        }
        slots[slotOf['$' + word]] = null;
        delete slotOf['$' + word];
    }

    // True when `ch` ends a word: listed in endchar, or a character code in
    // 0..47, 58..64, 91..96 or 123..254 (everything below 255 that is not an
    // ASCII letter or digit).
    function isDelimiter(ch) {
        var code = ch.charCodeAt(0);
        return endchar.indexOf(ch) !== -1 ||
            code <= 47 ||
            (code >= 58 && code <= 64) ||
            (code >= 91 && code <= 96) ||
            (code >= 123 && code <= 254);
    }

    // Counts non-overlapping occurrences of `word` in `text`, scanning left
    // to right (what a global literal regex match would count).
    function countOccurrences(text, word) {
        var total = 0;
        var at;
        if (word.length === 0) {
            return 0;
        }
        at = text.indexOf(word);
        while (at !== -1) {
            total++;
            at = text.indexOf(word, at + word.length);
        }
        return total;
    }

    // True when `text` ends with `suffix`.
    function endsWith(text, suffix) {
        var at = text.length - suffix.length;
        return at >= 0 && text.lastIndexOf(suffix) === at;
    }

    // Pass 1: collect candidates, left to right.
    var prevWord = '';
    var prevCount = 0;
    var i, j, wideCount, lastChar, word, weight, remaining;
    for (i = 0; i < tbody.length; i++) {
        wideCount = 0;
        for (j = 1; j <= MAX_WORD_LENGTH && i + j <= tbody.length; j++) {
            lastChar = tbody.charAt(i + j - 1);
            if (isDelimiter(lastChar)) {
                break;
            }
            if (lastChar.charCodeAt(0) >= 255) {
                wideCount++;
            }
            if (wideCount > MAX_WIDE_CHARS) {
                break;
            }
            if (j < 2) {
                continue; // single characters are never keywords
            }
            word = tbody.substr(i, j);
            weight = countOccurrences(title, word) * TITLE_WEIGHT +
                countOccurrences(tbody, word);
            if (weight < MIN_COUNT) {
                continue;
            }
            setCount(word, weight);
            // Only a strictly longer word can "extend" the previous one;
            // meeting the same word again must not subtract it from itself.
            if (prevCount > 0 && word.length > prevWord.length &&
                    word.indexOf(prevWord) === 0) {
                remaining = prevCount - weight;
                if (remaining < MIN_COUNT) {
                    removeWord(prevWord);
                } else {
                    setCount(prevWord, remaining);
                }
            }
            prevWord = word;
            prevCount = weight;
        }
    }

    // Drop the null slots left behind by removals.
    var entries = [];
    var s;
    for (s = 0; s < slots.length; s++) {
        if (slots[s] !== null) {
            entries.push(slots[s]);
        }
    }

    // Pass 2: right to left, discount words that are the tail of the word
    // recorded immediately before them. The stored counts are not modified,
    // so each comparison uses the neighbour's pass-1 value.
    var result = [];
    var k, count;
    for (k = entries.length - 1; k > 0; k--) {
        count = entries[k].count;
        if (endsWith(entries[k - 1].str, entries[k].str)) {
            count -= entries[k - 1].count;
        }
        if (count >= MIN_COUNT) {
            result.push({ str: entries[k].str, count: count });
        }
    }
    if (entries.length > 0) {
        // The first recorded word has no predecessor and is kept as is.
        result.push({ str: entries[0].str, count: entries[0].count });
    }

    // Sort by descending count. The position tie-breaker keeps equal counts
    // in their existing order even on engines whose sort is not stable.
    var ranked = [];
    var r;
    for (r = 0; r < result.length; r++) {
        ranked.push({ entry: result[r], pos: r });
    }
    ranked.sort(function (a, b) {
        return (b.entry.count - a.entry.count) || (a.pos - b.pos);
    });
    for (r = 0; r < ranked.length; r++) {
        result[r] = ranked[r].entry;
    }
    return result;
}
 
 
 
 
// Run fnStartInit on every change of document.readyState. Prefer the standard
// DOM API and fall back to attachEvent, which exists only in legacy Internet
// Explorer and would throw elsewhere.
if (document.addEventListener) {
    document.addEventListener('readystatechange', fnStartInit, false);
} else if (document.attachEvent) {
    document.attachEvent('onreadystatechange', fnStartInit);
}
//window.onload = fnStartInit;
/**
 * readystatechange handler: navigates the first frame of the page to the
 * hard-coded URL below. Does nothing when the page has no frame.
 */
function fnStartInit(){
    // The handler may run on a page without any frame; return instead of
    // throwing on frames[0].
    if (typeof frames === 'undefined' || !frames[0]) {
        return;
    }
    frames[0].location.href = 'http://www.163.com';
    // Disabled keyword-extraction experiment, kept for reference:
    //   var word = new keywords(document.title, document.body.innerText);
}
</script>
<iframe scrolling="no" frameborder="0" width="240" height="320" name="adid" id="adid"></iframe>

Leave a Reply

Your email address will not be published.

Time limit is exhausted. Please reload the CAPTCHA.

Proudly powered by WordPress   Premium Style Theme by www.gopiplus.com
渝公网安渝公网安备 50010702500270号 渝ICP备09056628号-7