基于反序词典的中文逆向最大匹配分词

Time:2011-03-02 11:33:35. Author:millken.
只能进行简单的分词,但思路很不错:根据CSV词典直接定位。
/*
 *基于反序词典的中文逆向最大匹配分词
 *U+4E00 - U+9FA5 Unicode中文区域(十进制 19968 - 40869)
 *
 **/
$str = <<toHalfWidth($str);
        $strLen = strlen($str);
        while($I < $strLen) {
            $stringTMP = substr($str,$I,1); 
            if ( ord($stringTMP) >=224 ) { 
                $this->_strArr[$i] = substr($str,$I,3); 
                $I = $I + 3;
                $this->_vp .= 3;
            }elseif( ord($stringTMP) >=192 ) { 
                $this->_strArr[$i] = substr($str,$I,2); 
                $I = $I + 2;
                $this->_vp .= 2;
            }else{ 
                $I = $I + 1;
                $this->_strArr[$i] = $stringTMP;
                $this->_vp .= 1;
            } 
            $i++;
        }
    }
    
    /**
     * Segment the previously split text by scanning it from the end toward the start.
     *
     * Each step seeds a new segment with the current character, then lets one of the
     * helpers (parseAscii, parseDateTime, parseWord) extend it leftwards; the helper
     * returns the index at which scanning should resume.
     *
     * @return array the segments in $this->_newArr after krsort(), i.e. highest key first
     */
    public function RMM() {
        $Len = count($this->_strArr);
        $i = 0;
        // NOTE(review): scanning starts at index count($this->_strArr) and never visits
        // index 0, so this presumably relies on _strArr/_vp being filled 1-based by the
        // splitting step - confirm against wordSplit().
        while (0 < $Len) {
            $str = $this->_strArr[$Len];
            $this->_newArr[$i] = $str;
            // Square-bracket offsets: the former $this->_vp{$Len} form is a fatal error on PHP 8.
            if ($this->_vp[$Len] == 1) {
                // Single-byte character: glue neighbouring single-byte characters together.
                $Len = $this->parseAscii($Len, $i);
            } elseif (strpos("`末内中底前间初年月日时分秒`", $str)) {
                // Date/time suffix such as "1年内" or "1999年末". The leading backtick keeps
                // a real match from landing at offset 0, which would read as false.
                $Len = $this->parseDateTime($Len, $i);
            } elseif (3 == $this->_vp[$Len] && false !== ($this->_words = $this->getWords($str))) {
                // Three-byte character with a dictionary line: try to grow a dictionary word.
                $Len = $this->parseWord($Len, $i);
            }
            $Len--;
            $i++;
        }
        krsort($this->_newArr);
        return $this->_newArr;
    }
    
    /**
     * Try to extend segment $I leftwards into a dictionary word.
     *
     * @param int $L index of the character that currently starts the segment
     * @param int $I index of the segment in $this->_newArr
     * @return int index from which the caller should continue scanning
     */
    private function parseWord($L, $I) {
        $words = $this->_strArr[$L-1] . $this->_newArr[$I];
        if (false !== strpos($this->_words, $words)) {
            // The longer candidate still occurs in the dictionary line: accept it and keep growing.
            $this->_newArr[$I] = $words;
            return $this->parseWord($L-1, $I);
        }

        // Check whether the segment collected so far starts a dictionary entry.
        // $this->_words begins with ',' (see getWords), so a hit can legitimately sit at
        // offset 0; the former "!strpos(...)" test misread that hit as "not found".
        $startsEntry = false !== strpos($this->_words, ',' . $this->_newArr[$I]);
        if (6 < strlen($words) && !$startsEntry) {
            // Give the leading 3-byte character back to the scanner.
            $this->_newArr[$I] = substr($this->_newArr[$I], 3);
            return $L+1;
        }
        return $L;
    }
    
    /**
     * Load the dictionary line that belongs to a single Chinese character.
     *
     * The character is converted to GBK and its two bytes (x1, x2) select a 1-based
     * line number: 94 * (x1 - 176) + (x2 - 160).
     *
     * @param string $word one UTF-8 encoded character
     * @return string|false the line with $word appended before every comma and a
     *                      leading ',' added, or false when no line applies
     */
    private function getWords($word) {
        $gbword = iconv('UTF-8', 'GBK', $word);
        // iconv() returns false for characters GBK cannot represent.
        if (false === $gbword || strlen($gbword) < 2) return false;

        // Square-bracket offsets: $gbword{0} is a fatal error on PHP 8.
        $lead = ord($gbword[0]);
        $trail = ord($gbword[1]);
        // Bytes below these bounds would yield line 0 or a negative line (seek(-1) throws),
        // or silently alias another character's line.
        if ($lead < 176 || $trail < 161) return false;

        $targetline = 94 * ($lead - 176) + $trail - 160;
        if ($targetline > 6427) return false;

        $file = new SplFileObject('./data.dic');
        $file->seek($targetline - 1);
        $words = str_replace(',', "{$word},", $file->current() );
        return ','.$words;
    }
    
    /**
     * Extend segment $I leftwards over consecutive single-byte characters.
     *
     * @param int $L index of the character that currently starts the segment
     * @param int $I index of the segment in $this->_newArr
     * @return int index of the left-most character absorbed into the segment
     */
    private function parseAscii($L, $I) {
        // The $L > 0 guard matters on PHP 7.1+, where a negative string offset reads
        // from the END of $this->_vp instead of yielding an empty value.
        while ($L > 0 && $this->_vp[$L-1] == 1) {
            $this->_newArr[$I] = $this->_strArr[$L-1] . $this->_newArr[$I];
            $L--;
        }
        return $L;
    }
	
	/**
	 * Extend segment $I leftwards while the preceding character is a Chinese numeral,
	 * a numeric string, or one of the date/time unit characters.
	 *
	 * @param int $L index of the character that currently starts the segment
	 * @param int $I index of the segment in $this->_newArr
	 * @return int index of the left-most character absorbed into the segment
	 */
	private function parseDateTime($L, $I) {
		while (true) {
			$previous = $this->_strArr[$L-1];
			$normalized = $this->toHalfWidth($previous);
			$belongs = $this->isChineseNum($normalized)
				|| is_numeric($normalized)
				|| strpos('`年月日时分秒`', $normalized);
			if (!$belongs) {
				return $L;
			}
			$this->_newArr[$I] = $previous . $this->_newArr[$I];
			$L = $L - 1;
		}
	}
	
	/**
	 * Look $str up inside the class-level $_chineseNum string.
	 *
	 * @param string $str text to search for
	 * @return int|false the raw strpos() result: byte offset of the match, or false
	 */
	public function isChineseNum($str) {
		// NOTE(review): callers test this for truthiness, so a match at offset 0 reads
		// as "no match" unless $_chineseNum starts with a padding character - confirm.
		$offset = strpos(self::$_chineseNum, $str);
		return $offset;
	}
	
	/**
	 * Replace every '|'-separated entry of the class-level $_cnSbcNum list with the
	 * ASCII character at the same position (digits, "+-%.", a-z, A-Z).
	 *
	 * @param string $str text to normalise
	 * @return string $str with the listed characters replaced
	 */
	public function toHalfWidth($str) {
		$asciiForms = explode('|', '0|1|2|3|4|5|6|7|8|9|+|-|%|.|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z');
		$wideForms = explode('|', self::$_cnSbcNum);
		return str_replace($wideForms, $asciiForms, $str);
	}
}

// Demo driver: split $str into characters, run the reverse matcher, and print the
// resulting segments separated by single spaces.
$cp = new chineseParticiple();
$cp->wordSplit($str);

$t = $cp->RMM();
$joined = implode(' ', $t);
echo $joined;

windows下国际化语言代码

Time:2011-01-26 12:32:29. Author:millken. Category:网络.
不同的系统平台下,有着不同的代码

css扒皮

Time:2011-01-20 11:08:50. Author:millken. Category:技术.
根据dom结构,采集所需的样式

a small MVC framework(update:2010-10-29)

Time:2011-01-08 02:30:17. Author:millken. Category:站务. Tags:作品,mkmvc.

2010-10-29

  1. fixed无法获取path_info时的BUG,参照了kohana框架
  2. added异常处理exception_handler

利用firebug的控制台来调试PHP程序

Time:2010-11-03 15:37:31. Author:millken. Category:技术. Tags:firebug,debug.
利用set_error_handler来捕捉错误,并将错误显示在firebug控制台内,这样代替print输出错误

jquery代码高亮插件SyntaxHighlighter修改版(update:2010-10-29)

Time:2010-10-29 15:46:53. Author:millken. Category:站务. Tags:jquery,作品.

此插件是在 http://allo.ave7.net/JQuery_with_SyntaxHighlighter  的基础上修改而来

SyntaxHighlighter是一个出色的语法高亮库,但实际使用,尤其是大量Js文件的包含,仍有不便之处。另外在W3C规范中,Pre元素是不能使用name属性的。这里通过JQuery动态加载所需的SyntaxHighlighter文件,将使用过程最简化。

反防盗链

Time:2010-10-18 11:20:28. Author:millken. Category:网络. Tags:防盗链,flash.
看我如何反防盗链

在石门按揭买了套房

Time:2010-10-13 03:10:47. Author:millken. Category:生活.
国庆把钱打给我姐,让她先帮我办理了首付事宜。接下来就是等银行的通知。俺终于还是成了房奴!!!

a small php template class

Time:2010-09-06 12:04:51. Author:millken. Category:技术.

/**
 * MyBlogAdmin Blog Platform
 *
 * @author     millken
 * @copyright  Copyright (c) 2010 MyBlogAdmin
 * @license    GNU General Public License 2.0
 * @version    $Id: template.class.php 11 2010-06-14 15:13:17Z millken $
 */
/**
 * common of template
 * plugin by yourself
 *
 */
// Directory that contains this file, with a trailing separator.
define('TEMPLATE_DIR', __DIR__ . DIRECTORY_SEPARATOR);
// Directory the engine loads tag plugins from: <this dir>/template_plugin/
define('TEMPLATE_PLUGIN_DIR', implode(DIRECTORY_SEPARATOR, array(__DIR__, 'template_plugin', '')));

/**
 * Small compiling template engine.
 *
 * fetch() reads a template, replaces every delimiter-enclosed tag through doParse(),
 * writes the result next to the compile directory and evaluates it with the assigned
 * variables available through $this->_vars.
 *
 * NOTE(review): several string literals in this class ($head in fetch(), the variable
 * branch of doParse(), the first two replacement pairs in fetch()) look truncated -
 * presumably an HTML filter removed their "<?php ..." parts before this copy was made.
 * They are reproduced unchanged here; restore them from the original source.
 */
class Template {
    public $ldel = "{";
    public $rdel = "}";
    public $template_dir = './templates/';
    public $compile_dir = './template_c/';
    public $compress = false;
    public $debug = false;
    public $_temp_key = array();
    public $_temp_val = array();
    /* Declared explicitly: these used to be created dynamically, which PHP 8.2 deprecates. */
    public $_vars = array();
    public $start_time = 0;
    public $time = null;
    static $version = '20100712';

    public function __construct() {
        $this->start_time = microtime(true);
    }

    public function __destruct(){
        unset($this->_vars);
    }

    /**
     * Register one template variable, or several when $tpl_var is an array of name => value.
     * Array values are converted to objects through arrayToObject().
     *
     * @param string|array $tpl_var variable name, or a map of names to values
     * @param mixed        $value   value for the single-name form
     */
    public function assign( $tpl_var, $value = null) {
        if (is_array($tpl_var)) {
            foreach ($tpl_var as $_key => $_val) {
                if ($_key != '') {
                    $this->_vars[$_key] = is_array($_val)?$this->arrayToObject($_val):$_val;
                }
            }
        } else {
            if ($tpl_var != '') {
                $this->_vars[$tpl_var] = is_array($value)?$this->arrayToObject($value):$value;
            }
        }
    }

    /**
     * Recursively convert an array into a stdClass whose property names are the
     * trimmed, lower-cased keys. Non-arrays are returned untouched; an empty array
     * yields FALSE.
     */
    private function arrayToObject($array) {
        if(!is_array($array)) {
            return $array;
        }
        if (count($array) === 0) {
            return FALSE;
        }
        $object = new stdClass();
        foreach ($array as $name=>$value) {
            // The former isset($name) check was always true and has been dropped.
            $name = strtolower(trim($name));
            $object->$name = $this->arrayToObject($value);
        }
        return $object;
    }

    /**
     * Compile (when needed) and evaluate a template.
     *
     * @param string $templatename file name relative to $template_dir
     * @return string evaluated output
     * @throws Exception when the compiled file cannot be created or the template is missing
     */
    public function fetch( $templatename ) {
        $compilefile = $this->compile_dir . $templatename . '.php';
        $nowtime = $this->getDifftime($templatename);
        if( is_file($compilefile) && $this->debug == false ) {
            $content = file_get_contents( $compilefile );
            // The compiled file is expected to carry the template mtime from byte 8 up to
            // the first comment terminator. Guard the "not found" case instead of doing
            // arithmetic on false.
            $end = strpos($content, "*/");
            $oldtime = (false === $end) ? '' : substr($content, 8, $end - 8);
            if( intval($oldtime) == $nowtime) {
                return $this->_eval($content);
            }
        }else{
            if(false == touch($compilefile)) {
                throw new Exception ("the compiled file: $compilefile unable writed!");
            }
        }
        $content = $this->fetch_source( $templatename );
        /* NOTE(review): as written these two replacements are no-ops; see class comment. */
        $content = preg_replace(
            array('/\?>/','/<\?([php])/i'),
            array('?>','<?\1'),
            $content );
        /*
         * Tag replacement. The /e modifier used here originally was removed in PHP 7
         * (preg_replace then returns NULL), so a callback is used instead. addslashes()
         * mirrors the escaping /e applied to the capture; doParse() undoes it with
         * stripslashes(), so tags now reach doParse() with literal backslashes intact.
         */
        $content = preg_replace_callback(
            "/{$this->ldel}([^\{$this->rdel}\{$this->ldel}\n]*){$this->rdel}/",
            function ($match) {
                return $this->doParse(addslashes($match[1]));
            },
            $content );
        /* NOTE(review): $head is empty, so the mtime check above can never succeed. */
        $head = '';
        file_put_contents($compilefile , $head . $content);
        $content = $this->_eval($content);
        return $content;
    }

    /**
     * Fetch a template, optionally collapse whitespace after '>' and either print it
     * and terminate the script ($output true) or return it.
     */
    public function display( $filename, $output = true ) {
        $content = $this->fetch($filename);
        if($this->compress)
            $content = preg_replace(array("~>\s+\r~","~>\s+\n~","~>\s+<~"),    array(">",">","><"),$content);
        $this->time = sprintf('%.4f', microtime(true) - $this->start_time);
        if($output)die($content);
        return $content;
    }

    /**
     * Evaluate compiled template text in the scope of this object and return the
     * captured output.
     * NOTE(review): eval() runs whatever PHP the template contains - templates must be trusted.
     */
    protected function _eval( $content ) {
        ob_start();
        eval('?' . '>' . trim($content));
        $content = ob_get_contents();
        ob_end_clean();
        return $content;
    }

    /**
     * Translate the text found between the delimiters into its compiled form.
     *
     * @param string $tag tag text, slash-escaped by the caller
     * @return string replacement text for the tag
     */
    protected function doParse( $tag ) {
        $tag = stripslashes(trim($tag));

        if (empty($tag)) {
            return '{}';
        }elseif($tag[0] == '*' && substr($tag, -1) == '*') { // comment tag: drop it
            return '';
        }elseif($tag[0] == '$'){ // variable; "a.b.c" becomes a property chain
            $tags = explode('.',substr($tag,1));
            $var = array_shift($tags);
            $vart = '';
            if(!empty($tags)) {
                $vart = '->' . implode('->', $tags);
            }
            /* NOTE(review): literal looks truncated; see class comment. */
            return '_vars[\'' . $var . '\']' . $vart . '; ?>';
        }elseif($tag[0] == '/'){ // end tag
            $plug = 'End' . substr($tag, 1) ;
        }else{
            // array_shift() needs a real variable; passing explode() directly raises a notice.
            $parts = explode(' ', $tag);
            $plug = array_shift($parts);
        }

        $plugin = $this->loadPlugin($plug);
        if(!$plugin)return '{ '. $tag .' }';
        return $plugin->compile($tag);

    }

    /**
     * Remember the current values of the named variables as PHP assignment statements.
     * NOTE(review): values are string-concatenated, so arrays/objects and values
     * containing quotes are not preserved faithfully.
     */
    private function push_vars($key, $val)    {
        if (!empty($key))
            array_push($this->_temp_key, "\$this->_vars['$key']='" .$this->_vars[$key] . "';");
        if (!empty($val))
            array_push($this->_temp_val, "\$this->_vars['$val']='" .$this->_vars[$val] . "';");
    }

    /**
     * Pop the most recent entries pushed by push_vars() and re-run the saved key assignment.
     * NOTE(review): the popped $val statement is discarded, never evaluated - confirm intent.
     */
    private function pop_vars() {
        $key = array_pop($this->_temp_key);
        $val = array_pop($this->_temp_val);

        if (!empty($key))eval($key);
    }

    /**
     * Read a template file from $template_dir.
     *
     * @throws Exception when the file does not exist
     */
    public function fetch_source( $filename ) {
        $file = $this->template_dir . $filename;
        if(is_file( $file )) {
            return file_get_contents( $file );
        }else{
            throw new Exception ('template \'' . $filename . '\'not exists!');
        }
    }

    /**
     * Instantiate the plugin class for a tag name, including its file from
     * TEMPLATE_PLUGIN_DIR when the class is not loaded yet.
     *
     * @return object|false plugin instance, or false when no plugin file exists
     */
    public function loadPlugin($plugin_name)    {
        $classname = 'Template_Plugin_' . ucfirst(strtolower($plugin_name));
        if (class_exists($classname, false) && method_exists('Template_Plugin','compile'))
        return new $classname;

        $filename = TEMPLATE_PLUGIN_DIR . strtolower($classname) . '.php';
        if(is_file($filename)) {
            include_once ($filename);
            return new $classname;
        }
        return false;
    }

    /** Create $path and any missing parent directories. */
    private function mkdir( $path ) {
        return is_dir($path) || ($this->mkdir(dirname($path)) && mkdir($path, 0777));
    }

    /** Modification time of a template file inside $template_dir. */
    private function getDifftime($templatefile) {
        return filemtime($this->template_dir . $templatefile);
    }

    public function setDelimiter($left_delimiter, $right_delimiter) {
        $this->ldel = $left_delimiter;
        $this->rdel = $right_delimiter;
    }

    public function setCompress( $is_compress = false ) {
        $this->compress = $is_compress;
    }

    public function setCompileDir ( $dir, $makedir = false ) {
        if($makedir)$this->mkdir($dir);
        $this->compile_dir = $dir;
    }

    public function setTemplateDir( $dir ) {
        if(!is_dir($dir)) throw new Exception ("the directory : $dir not exists!");
        $this->template_dir = $dir . DIRECTORY_SEPARATOR;
    }
}

/*
 * Interface every template tag plugin implements.
 *
 * Template::doParse() passes the tag text to compile() and uses the returned
 * string as the replacement for that tag.
 */
interface Template_Plugin {
    // $tag: the tag text as found between the delimiters; returns its compiled form.
    public function compile($tag);
}
/**
 * Compiles "foreach $list as $item" and "foreach $list as $key => $value" tags.
 * A dotted source such as $a.b.c is turned into a property chain on $this->_vars['a'].
 */
class Template_Plugin_Foreach implements Template_Plugin {
    /**
     * @param string $tag tag text
     * @return string compiled loop header
     * @throws Exception when the tag does not follow either supported form
     */
    function compile($tag) {
        $pattern = "/foreach\s+\\$([\w\.]+)\s+as\s+(\\$(\w+)\s*$|\\$(\w+)\s*=>\s*\\$(\w+)$)/i";
        // Previously an unmatched tag fell through with undefined offsets and an
        // undefined $as, producing an unparsable loop header.
        if (!preg_match($pattern, $tag, $m)) {
            throw new Exception('malformed foreach tag:' . $tag);
        }

        $key = $val = '';
        $single = isset($m[3]) ? trim($m[3]) : '';
        if ($single !== '') {
            // "as $item" form
            $key = $single;
            $as = '$this->_vars[\''. $key .'\']';
        } else {
            // "as $key => $value" form
            $key = trim($m[4]);
            $val = trim($m[5]);
            $as = '$this->_vars[\'' . $key . '\'] => $this->_vars[\'' . $val . '\']';
        }

        $tags = explode('.', $m[1]);
        $var1 = array_shift($tags);
        $token = '$this->_vars[\'' . $var1 . '\']';
        if (!empty($tags)) {
            $token .= '->' . implode('->', $tags);
        }

        /* NOTE(review): the start of this literal looks truncated (presumably an HTML
           filter removed a leading PHP open tag); kept exactly as found. */
        return 'push_vars(\'' . $key .'\', \''. $val .'\'); foreach((array)'. $token .' as '. $as .'){ ?>';
    }

}

/**
 * Compiles the closing tag of a foreach block.
 */
class Template_plugin_EndForeach implements Template_Plugin {
    /**
     * @param string $tag tag text (unused)
     * @return string fixed closing fragment
     */
    function compile($tag) {
        /* NOTE(review): literal looks truncated by HTML stripping; kept as found. */
        return 'pop_vars(); ?>';
    }
}
/**
 * Compiles "if" tags: tokenises the condition, maps word operators to PHP operators,
 * rewrites $name / $a.b variables to $this->_vars lookups and rejects calls to
 * functions that are not on the allow list.
 */
class Template_Plugin_If implements Template_Plugin {
    /**
     * @param string $tag tag text, starting with the tag name
     * @return string whatever output() produces for the translated tokens
     * @throws Exception when the condition names an existing function that is not allowed
     */
    function compile($tag) {
        preg_match_all('/\-?\d+[\.\d]+|\'[^\'|\s]*\'|"[^"|\s]*"|[\$\w\.]+|!==|===|==|!=|<>|<<|>>|<=|>=|&&|\|\||\(|\)|,|\!|\^|=|&|<|>|~|\||\%|\+|\-|\/|\*|\@|\S/', $tag, $match);
        $tokens = $match[0];
        // Functions a template condition may call.
        $functionlist = array('strtolower','strtoupper','strlen','urldecode','in_array','array_exists','array_keys','array_values');
        // Drop the tag name itself.
        unset($tokens[0]);
        // Iterate every remaining token. The former index loop took count() after the
        // unset above and therefore never reached the last token.
        foreach ($tokens as &$token) {
            switch (strtolower($token)) {
                case 'eq':$token = '==';break;
                // 'ne', 'le' and 'ge' used to be no-ops that left the bare word in the
                // condition; they now share the translation of their longer spellings.
                case 'ne':
                case 'neq':$token = '!=';break;
                case 'lt':$token = '<';break;
                case 'le':
                case 'lte':$token = '<=';break;
                case 'gt':$token = '>';break;
                case 'ge':
                case 'gte':$token = '>=';break;
                case 'and':$token = '&&';break;
                case 'or':$token = '||';break;
                case 'not':$token = '!';break;
                case 'mod':$token = '%';break;
                default:
                    if ($token[0] == '$') {
                        $tags = explode('.',substr($token, 1));
                        $var = array_shift($tags);
                        $token = '$this->_vars[\'' . $var . '\']';
                        if(!empty($tags)) {
                            $token .= '->' . implode('->', $tags);
                        }
                    }elseif(function_exists($token) && !in_array($token, $functionlist)) {
                        throw new Exception('can\'t use function:'.$token.'');
                    }
                break;
            }
        }
        unset($token); // break the reference left behind by foreach

        return $this->output($tokens);
    }

    /**
     * Build the compiled fragment from the translated tokens.
     * NOTE(review): returns an empty string here; the literal was presumably removed
     * by HTML stripping - restore from the original source.
     */
    function output($tokens) {
        return '';
    }
}

/**
 * Compiles "elseif" tags; token translation is inherited from Template_Plugin_If.
 */
class Template_Plugin_Elseif extends Template_Plugin_If  {
    /**
     * @param array $tokens translated condition tokens
     * @return string compiled fragment
     */
    function output($tokens) {
        /* NOTE(review): empty literal, presumably truncated by HTML stripping. */
        $compiled = '';
        return $compiled;
    }

}

/**
 * Compiles "else" tags.
 */
class Template_plugin_Else implements Template_Plugin {
    /**
     * @param string $tag tag text (unused)
     * @return string compiled fragment
     */
    function compile($tag) {
        /* NOTE(review): empty literal, presumably truncated by HTML stripping. */
        return '';
    }
}
/**
 * Compiles the closing tag of an if block.
 */
class Template_plugin_EndIf implements Template_Plugin {
    /**
     * @param string $tag tag text (unused)
     * @return string compiled fragment
     */
    function compile($tag) {
        /* NOTE(review): empty literal, presumably truncated by HTML stripping. */
        return '';
    }
}
/**
 * Compiles include tags with a file="name" attribute into a fetch() call on the named template.
 */
class Template_Plugin_Include implements Template_Plugin {
    /**
     * @param string $tag tag text
     * @return string compiled fragment
     */
    function compile($tag) {
        // When the pattern does not match, preg_replace() hands the tag back unchanged.
        $pattern = '/include\s*file\=["\']?\s*([a-zA-Z0-9_.\/]+)\s*[\'"]?\s*/is';
        $file = preg_replace($pattern, '\\1', $tag);
        /* NOTE(review): the start of this literal looks truncated by HTML stripping. */
        return "fetch('" . $file . "'); ?>";
    }
}

根据链路自动判别启用哪条网络

Time:2010-09-06 04:17:18. Author:millken. Category:网络.
国内还是走国内,国外就走VPN。

jquery简单实现图片自动缩放

Time:2010-07-28 04:08:04. Author:millken. Category:技术. Tags:作品,jquery.
// Shrinks every <img> whose "data" attribute starts with "maxsize_" so that it fits
// inside the given box, e.g. data="maxsize_300*200" (a single number is used for
// both width and height). Images already inside the box are left alone.
$(document).ready(function() {
    $('img[data^=maxsize_]').each(function() {
        var img = this;
        var autoScaling = function() {
            // Everything after the 8-character "maxsize_" prefix.
            var size = $(img).attr('data').substring(8);
            // These were implicit globals before (no "var"), shared between all images
            // and a ReferenceError in strict mode.
            var box = size.split("*");
            var boxWidth = box[0];
            var boxHeight = isNaN(box[1]) ? box[0] : box[1];
            var ratio = Math.max(img.width / boxWidth, img.height / boxHeight);
            if (ratio > 1) {
                $(img).css({'width': img.width / ratio, 'height': img.height / ratio});
            }
        };
        // Already-loaded (cached) images never fire "load", so scale them right away.
        if (this.complete) autoScaling();
        // NOTE(review): the .load() event shorthand was removed in jQuery 3; switch to
        // .on('load', ...) if the page uses jQuery 1.7 or later.
        $(this).load(function() {
            autoScaling();
        });
    });
});

调用上面代码后,只要img含有类似data="maxsize_300*200",就可以实现自动缩放。

javascriptErrorLogger

Time:2010-07-28 03:00:58. Author:millken. Category:技术.

// Reports uncaught JavaScript errors to a server-side logging endpoint.
var JavascriptErrorLogger = {
    // Installs the handler as window.onerror.
    initialize: function() {
        window.onerror = JavascriptErrorLogger.onError_handler;
    },

    // Sends the error details plus browser/page information as GET parameters.
    onError_handler: function(errorMessage, errorUrl, lineNumber) {
        var language = (typeof (navigator.browserLanguage) != 'undefined')
            ? navigator.browserLanguage
            : navigator.language;

        // Parameter order matches the query string the endpoint has always received.
        var fields = [
            ['errorMessage', errorMessage],
            ['errorUrl', errorUrl],
            ['lineNumber', lineNumber],
            ['browserCodeName', navigator.appCodeName],
            ['browserAppName', navigator.appName],
            ['browserName', navigator.product],
            ['browserVersion', navigator.appVersion],
            ['browserLanguage', language],
            ['userPlatform', navigator.platform],
            ['userAgent', navigator.userAgent],
            ['windowLocationUrl', window.location.href]
        ];

        var pairs = [];
        for (var i = 0; i < fields.length; i++) {
            pairs.push(fields[i][0] + '=' + encodeURIComponent(fields[i][1]));
        }

        var loggerURL = '/WebServices/ClientLogging.asmx/WriteJavascriptErrorToLog';
        jQuery.ajax({
            type: 'GET',
            url: '' + loggerURL + '?' + pairs.join('&'),
            dataType: 'text'
        });
    }
};
JavascriptErrorLogger.initialize();



关于我:

陈震(millken) 男 26岁 高级程序员

湖南石门人,现混迹于上海。

联系方式:millken#gmail.com

开放分类