根据 DOM 结构，采集所需的样式

CSS 扒皮

Time:2011-01-20 11:08:50. Author:millken. Category:技术.


/**
 * Extracts, from the stylesheets linked by an HTML page, only the CSS rules
 * the page uses: every rule whose selector starts with neither "." nor "#"
 * (element/"reset" rules) plus every rule whose selector mentions a class or
 * id that appears in the markup.
 *
 * Known limitation: rules are split with a flat "selector { declarations }"
 * regex, so nested blocks such as @media are not parsed correctly.
 */
class Grabstyle {
	/** Parsed rules (selector => declarations) not yet claimed by getResetStyle()/getStyle(). */
	public $styles = array();
	/** Snapshot of every parsed rule, taken before any rule is claimed. */
	public $styles1 = array();
	/** Stylesheet URLs found in the HTML (resolved against $url when one is set). */
	public $links = array();
	/** Raw values of the class="..." attributes found in the HTML. */
	public $class = array();
	/** Selectors chosen for output, in selection order (may contain duplicates). */
	public $CSS = array();
	/** Raw values of the id="..." attributes found in the HTML. */
	public $id = array();
	/** Output rules, selector => "selector { declarations }". */
	public $newStyles = array();
	/** HTML fetched by parseUrl(). */
	public $html = '';
	/** Page location given to parseUrl(); used as the base for stylesheet hrefs. */
	public $url = '';

	/**
	 * Collects the rules needed by $html from the stylesheets it links to.
	 *
	 * @param string $html HTML markup to inspect.
	 * @return string The selected rules, one "selector { declarations }" per line.
	 */
	public function parseHtml($html) {
		$html = (string)$html;

		// Stylesheet hrefs: double-quoted, ending in ".css" (any query string is dropped).
		preg_match_all("~href=\"([^\"]+\.css)(?:.*?)\"[^>]*>~i",$html,$links);
		$this->links = $links[1];
		if ((string)$this->url !== '') {
			// hrefs are relative to the page, not to the working directory.
			$this->links = $this->_expandlinks($this->links, $this->url);
		}
		foreach((array)$this->links as $link) {
			$cssContent = file_get_contents($link);
			if ($cssContent === false) {
				// An unreadable stylesheet must not poison the remaining ones.
				continue;
			}
			$this->parseStyle($this->_absolutizeCssUrls($cssContent, $link));
		}
		$this->styles1 = $this->styles;
		$this->getResetStyle();

		// Claim the rules that mention a class used in the markup.
		preg_match_all("~ class=\"([^\"]+)~i",$html,$all);
		$this->class = $all[1];
		array_walk($this->class, array($this, 'getStyle'), 'class');

		// Claim the rules that mention an id used in the markup.
		preg_match_all("~ id=\"([^\"]+)~i",$html,$all);
		$this->id = $all[1];
		array_walk($this->id, array($this, 'getStyle'));

		foreach(array_unique($this->CSS) as $key) {
			if (!isset($this->styles1[$key])) {
				// Selector claimed by an earlier call on this object; its
				// entry in $newStyles (if any) is already in place.
				continue;
			}
			$this->newStyles[$key] = "$key { " . $this->styles1[$key] ." }";
		}

		return implode("\n", $this->newStyles);
	}

	/**
	 * Claims every rule whose selector starts with neither "." nor "#".
	 */
	public function getResetStyle() {
		foreach($this->styles1 as $key=>$val) {
			// Keys may have been turned into integers by PHP's array key
			// casting, and may be empty; substr() copes with both.
			$first = (string)substr((string)$key, 0, 1);
			if($first != '.' && $first != '#') {
				$this->CSS[] = $key;
				unset($this->styles[$key]);
			}
		}
	}

	/**
	 * array_walk() callback: claims every unclaimed rule whose selector
	 * references one of the names in an attribute value.
	 *
	 * @param string $key  Attribute value; may hold several whitespace-separated names.
	 * @param mixed  $key2 Array index supplied by array_walk(); unused.
	 * @param string $type 'class' to look for ".name", anything else for "#name".
	 */
	public function getStyle($key, $key2, $type='') {
		$prefix = ($type === 'class' ? '.' : '#');
		$names = preg_split('/\s+/', trim((string)$key), -1, PREG_SPLIT_NO_EMPTY);
		foreach ((array)$names as $name) {
			// The lookahead stops ".nav" from also claiming ".navbar" or ".nav-item".
			$pattern = '~' . preg_quote($prefix . $name, '~') . '(?![\w\x80-\xff-])~';
			foreach(array_keys($this->styles) as $val) {
				if(preg_match($pattern, (string)$val)) {
					$this->CSS[] = $val;
					unset($this->styles[$val]);
				}
			}
		}
	}

	/**
	 * Fetches a page and collects the rules it needs.
	 *
	 * @param string $url Location readable by file_get_contents().
	 * @return string Same as parseHtml(); an empty page is assumed when the fetch fails.
	 */
	public function parseUrl($url) {
		$this->url = $url;
		$html = file_get_contents($this->url);
		$this->html = ($html === false) ? '' : $html;
		return $this->parseHtml($this->html);
	}

	/**
	 * Splits CSS text into rules and adds them to $styles.
	 *
	 * Selectors and declarations are stored trimmed; comments are removed
	 * first so they cannot end up inside a selector. When a selector occurs
	 * more than once its declarations are concatenated in source order
	 * instead of the later rule replacing the earlier one.
	 *
	 * @param string $css Stylesheet source.
	 */
	public function parseStyle($css) {
		$css = preg_replace('~/\*.*?\*/~s', '', (string)$css);
		if ($css === null) {
			// PCRE failure (e.g. backtrack limit): nothing reliable to parse.
			return;
		}
		preg_match_all("~(.*?)\{(.*?)\}~isx",$css,$style);
		foreach ($style[1] as $i => $selector) {
			$selector = trim($selector);
			$declarations = trim($style[2][$i]);
			if ($selector === '') {
				continue;
			}
			if (isset($this->styles[$selector]) && $this->styles[$selector] !== '') {
				$declarations = ($declarations === '')
					? $this->styles[$selector]
					: rtrim($this->styles[$selector], "; \t\r\n") . '; ' . $declarations;
			}
			$this->styles[$selector] = $declarations;
		}
	}

	/**
	 * Resolves one link, or an array of links, against a base location.
	 *
	 * @param string|array $links Link(s) to resolve; array keys are preserved.
	 * @param string       $URI   Base location (URL or path) the links are relative to.
	 * @return string|array Resolved link(s), same shape as $links.
	 */
	public function _expandlinks($links,$URI) {
		if (is_array($links)) {
			$expandedLinks = array();
			foreach ($links as $key => $link) {
				$expandedLinks[$key] = $this->_resolveLink($link, $URI);
			}
			return $expandedLinks;
		}

		return $this->_resolveLink($links, $URI);
	}

	/**
	 * Rewrites the url(...) / src="..." references in a stylesheet so they
	 * are resolved against the stylesheet's own location.
	 */
	private function _absolutizeCssUrls($cssContent, $link) {
		preg_match_all("/url\(\s*(?:[\"\'])?(.*?)\s*(?:[\"\'])?\)|src\=(?:[\"\'])([^'\"]+)/isx", $cssContent, $found);
		$urls = array_unique(array_filter(array_merge($found[1], $found[2])));
		if (!$urls) {
			return $cssContent;
		}
		$map = array();
		foreach ($this->_expandlinks($urls, $link) as $key => $val) {
			$map[$urls[$key]] = $val;
		}
		// strtr() prefers the longest key and never rescans replaced text, so
		// "a.png" cannot corrupt "img/a.png" or an already rewritten URL.
		return strtr($cssContent, $map);
	}

	/**
	 * Resolves a single link against a base location.
	 */
	private function _resolveLink($link, $base) {
		$link = (string)$link;

		// Anything with its own scheme (http:, https:, mailto:, data:, ...) is final.
		if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $link)) {
			return $link;
		}

		$base = (string)$base;
		$base = (string)substr($base, 0, strcspn($base, '?#'));
		$parts = parse_url($base);
		if (!is_array($parts)) {
			$parts = array('path' => $base);
		}

		// Protocol-relative link: only the scheme comes from the base.
		if (substr($link, 0, 2) === '//') {
			return isset($parts['scheme']) ? $parts['scheme'] . ':' . $link : $link;
		}

		$root = '';
		if (isset($parts['scheme']) && isset($parts['host'])) {
			$root = $parts['scheme'] . '://' . $parts['host'];
			if (isset($parts['port'])) {
				$root .= ':' . $parts['port'];
			}
		}

		// Directory of the base: a last segment containing a dot is taken to
		// be a file name and dropped; a dot-less one is kept as a directory.
		$path = isset($parts['path']) ? $parts['path'] : '';
		$slash = strrpos($path, '/');
		$last = ($slash === false) ? $path : (string)substr($path, $slash + 1);
		if ($last !== '' && strpos($last, '.') !== false) {
			$dir = ($slash === false) ? '' : (string)substr($path, 0, $slash);
		} else {
			$dir = rtrim($path, '/');
		}

		// Only the path part is normalised; query and fragment pass through.
		$cut = strcspn($link, '?#');
		$linkPath = (string)substr($link, 0, $cut);
		$suffix = (string)substr($link, $cut);

		if ($linkPath !== '' && $linkPath[0] === '/') {
			return $root . $this->_removeDotSegments($linkPath) . $suffix;
		}

		$joined = ($dir === '' && $root === '') ? $linkPath : $dir . '/' . $linkPath;
		return $root . $this->_removeDotSegments($joined) . $suffix;
	}

	/**
	 * Collapses "." and ".." segments of a path.
	 */
	private function _removeDotSegments($path) {
		$path = (string)$path;
		$absolute = ($path !== '' && $path[0] === '/');
		$segments = explode('/', $absolute ? (string)substr($path, 1) : $path);
		$kept = array();
		foreach ($segments as $segment) {
			if ($segment === '.') {
				continue;
			}
			if ($segment === '..') {
				if ($kept && end($kept) !== '..') {
					array_pop($kept);
				} elseif (!$absolute) {
					// A relative path may legitimately climb above its start.
					$kept[] = '..';
				}
				continue;
			}
			$kept[] = $segment;
		}
		$result = ($absolute ? '/' : '') . implode('/', $kept);
		$last = end($segments);
		if (($last === '.' || $last === '..') && $result !== '' && substr($result, -1) !== '/') {
			// A trailing dot segment denotes a directory.
			$result .= '/';
		}
		return $result;
	}

}
/**
 * Debug helper: writes a print_r() dump of $var to the output, with one
 * newline before it and one after it.
 *
 * NOTE(review): the newline-only strings look like "<pre>" / "</pre>" tags
 * that were lost when this listing was published — confirm against the
 * original script before relying on the exact output.
 *
 * @param mixed $var Value to dump.
 */
function d($var){
	$dump = print_r($var, true);
	echo "\n", $dump, "\n";
}
//<\s*a\s.*?href\s*=\s*([\"\'])?(?(1)(.*?)\\1|([^\s\>]+)) [^>]*>?(.*?)</a>
// Demo run: collect the styles needed by the local page "t.htm" and print them.
$obj = new Grabstyle;
print $obj->parseUrl('t.htm');
// Manual check of the link resolver, left disabled:
//$c = $obj->_expandlinks('./baidu/ss.html','http://a.com/tttttt/aa.html');
//d($c);

</pre>

关于我:

陈震(millken) 男 26岁 高级程序员

湖南石门人,现混迹于上海。

联系方式:millken#gmail.com

开放分类