网页关键词频率计算(词频计算js版)
不需要词库,直接分割网页内容来提取词语,并统计每个词语的出现次数,按从多到少排序。这里能区分中英文词语,最初用于广告匹配的数据预处理,用 JS 在访客客户端执行。
<script type="text/javascript">
// Original author: 石卓林, 2008-07-12, second version (left-to-right plus right-to-left matching).
/**
 * Extracts repeated words from page text without a word list and ranks them
 * by frequency.
 *
 * Every position of the whitespace-normalised body is expanded into candidate
 * substrings of 2..15 characters that contain only ASCII letters/digits and
 * characters with code >= 255 (at most 8 of the latter, e.g. Chinese). Each
 * candidate is scored as `bodyOccurrences + 2 * titleOccurrences`
 * (non-overlapping, case-sensitive); candidates scoring below 2 are dropped.
 * Two passes then remove fragments of longer words:
 *   - left to right: when a candidate extends the previously accepted one,
 *     the longer word's score is subtracted from the shorter prefix;
 *   - right to left: when a candidate is a suffix of the entry stored just
 *     before it, that entry's score is subtracted from the candidate.
 * Entries whose score falls below 2 are discarded.
 *
 * The function uses no `this`, so it works both as `keywords(t, b)` and as
 * `new keywords(t, b)` (a constructor returning an object yields that object).
 * It is written in ES3-compatible syntax because the surrounding page targets
 * legacy browsers.
 *
 * @param {string} ftitle Page title; matches in it count double. null/undefined is treated as ''.
 * @param {string} ftbody Page text to analyse. null/undefined is treated as ''.
 * @returns {Array<{str: string, count: number}>} Words sorted by count, highest first;
 *   ties keep reverse discovery order.
 */
function keywords(ftitle, ftbody) {
  var MAX_WORD_LENGTH = 15; // longest candidate, sized for English words
  var MAX_WIDE_CHARS = 8;   // most characters with code >= 255 allowed in one candidate
  var TITLE_WEIGHT = 2;     // a title occurrence counts this many times
  var MIN_COUNT = 2;        // words scoring below this are discarded
  var DELIMITERS = '。,:… (—)》《';
  var hasOwn = Object.prototype.hasOwnProperty;

  var title = ftitle == null ? '' : String(ftitle);
  var body = (ftbody == null ? '' : String(ftbody))
    .replace(/\s+/g, ' ')
    .replace(/^\s+|\s+$/g, '');

  // Insertion-ordered dictionary: `entries` keeps discovery order (removed
  // slots become null), `slotOf` maps a word to its index in `entries`.
  // A word that is removed and later re-added goes to the end.
  var entries = [];
  var slotOf = {};
  // Scores depend only on the word, so compute each one once.
  var scoreCache = {};

  // True for characters that end a candidate: the listed punctuation, every
  // ASCII character that is not a letter or digit, and codes 123..254.
  function isDelimiter(ch) {
    var code = ch.charCodeAt(0);
    return DELIMITERS.indexOf(ch) !== -1 ||
      code <= 47 ||
      (code >= 58 && code <= 64) ||
      (code >= 91 && code <= 96) ||
      (code >= 123 && code <= 254);
  }

  // Counts non-overlapping occurrences of `word` in `text`, scanning left to right.
  function countOccurrences(text, word) {
    var total = 0;
    var at = text.indexOf(word);
    while (at !== -1) {
      total++;
      at = text.indexOf(word, at + word.length);
    }
    return total;
  }

  // Body occurrences plus weighted title occurrences, cached per word.
  function weightedCount(word) {
    if (!hasOwn.call(scoreCache, word)) {
      scoreCache[word] = countOccurrences(body, word) +
        TITLE_WEIGHT * countOccurrences(title, word);
    }
    return scoreCache[word];
  }

  // Inserts the word at the end, or updates its score in place if present.
  function setCount(word, count) {
    if (hasOwn.call(slotOf, word)) {
      entries[slotOf[word]].count = count;
    } else {
      slotOf[word] = entries.length;
      entries.push({ str: word, count: count });
    }
  }

  // Removes the word if present.
  function removeWord(word) {
    if (hasOwn.call(slotOf, word)) {
      entries[slotOf[word]] = null;
      delete slotOf[word];
    }
  }

  // True when `suffix` is a proper suffix of `longer`.
  function endsWithProper(longer, suffix) {
    return longer.length > suffix.length &&
      longer.substring(longer.length - suffix.length) === suffix;
  }

  // Pass 1 (left to right): collect candidates and discount prefixes.
  var prevWord = '';
  var prevCount = 0;
  for (var start = 0; start < body.length; start++) {
    var wideChars = 0;
    for (var len = 1; len <= MAX_WORD_LENGTH; len++) {
      var end = start + len;
      if (end > body.length) { break; }
      var ch = body.charAt(end - 1);
      if (isDelimiter(ch)) { break; }
      if (ch.charCodeAt(0) >= 255) {
        wideChars++;
        if (wideChars > MAX_WIDE_CHARS) { break; }
      }
      if (len < 2) { continue; } // single characters are never words

      var word = body.substring(start, end);
      var count = weightedCount(word);
      if (count < MIN_COUNT) { continue; }

      setCount(word, count);
      // Occurrences of the longer word also contain the shorter prefix, so
      // take them away from the prefix. `word !== prevWord` keeps a word from
      // being subtracted from itself when it is met again at a later position.
      if (prevCount > 0 && word !== prevWord && word.indexOf(prevWord) === 0) {
        var remaining = prevCount - count;
        if (remaining < MIN_COUNT) {
          removeWord(prevWord);
        } else {
          setCount(prevWord, remaining);
        }
      }
      prevWord = word;
      prevCount = count;
    }
  }

  var live = [];
  for (var slot = 0; slot < entries.length; slot++) {
    if (entries[slot]) { live.push(entries[slot]); }
  }

  // Pass 2 (right to left): discount a word that is a suffix of the entry
  // stored just before it, using that entry's unadjusted score.
  var ranked = [];
  for (var i = live.length - 1; i >= 0; i--) {
    var score = live[i].count;
    if (i > 0 && endsWithProper(live[i - 1].str, live[i].str)) {
      score -= live[i - 1].count;
    }
    if (score >= MIN_COUNT) {
      ranked.push({ str: live[i].str, count: score, order: ranked.length });
    }
  }

  // Highest score first; `order` makes ties deterministic on engines whose
  // Array.prototype.sort is not stable.
  ranked.sort(function (a, b) {
    return (b.count - a.count) || (a.order - b.order);
  });

  var result = [];
  for (var r = 0; r < ranked.length; r++) {
    result.push({ str: ranked[r].str, count: ranked[r].count });
  }
  return result;
}
// Run fnStartInit on every document readyState change.
// attachEvent exists only in legacy Internet Explorer, so prefer the standard
// addEventListener and keep attachEvent as the fallback for old IE.
if (document.addEventListener) {
  document.addEventListener('readystatechange', fnStartInit, false);
} else if (document.attachEvent) {
  document.attachEvent('onreadystatechange', fnStartInit);
}
// Alternative hook, currently disabled:
//window.onload = fnStartInit;
/**
 * Start-up hook: points the first frame of the window at a fixed test page.
 * Does nothing when the window has no frame yet, so it is safe to call
 * before the iframe has been parsed.
 */
function fnStartInit(){
  var firstFrame = frames[0];
  if(!firstFrame){
    return; // no frame available; dereferencing it would throw
  }
  firstFrame.location.href = 'http://www.163.com';
  // Disabled experiment kept for reference (extract keywords from the current page):
  // //if(document.readyState=="interactive"){/
  // document.write('<iframe scrolling="no" frameborder="0" width="240" height="120"></iframe>');
  //var word = new keywords(document.title,document.body.innerText);
  // //document.write(word.join(' | '));
  // //}
}
</script>
<iframe scrolling="no" frameborder="0" width="240" height="320" name="adid" id="adid"></iframe>