<?php
/**
 * Sina blog article content parser.
 *
 * Downloads one Sina blog article page and prints a JSON document holding the
 * article title and the article body HTML.
 *
 * Output:
 *   code 1   - success; "data" carries "title" and "content"
 *   code 203 - no article URL configured
 *   code 204 - title or content could not be extracted from the page
 */
header("content-type:application/json; charset=utf-8");
error_reporting(0); // Turn off all PHP error reporting.

// Leave "/" and non-ASCII characters unescaped in the JSON output.
// JSON_UNESCAPED_SLASHES (64) | JSON_UNESCAPED_UNICODE (256) equals the literal 320 used previously.
$jsonFlags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE;

$url = 'https://blog.sina.com.cn/s/blog_4d89b834010300iu.html'; // Replace with the Sina blog article URL to fetch.
if (empty($url)) {
    exit(json_encode(['code' => 203, 'msg' => '文章链接为空,例如https://blog.sina.com.cn/s/blog_4d89b834010300iu.html'], $jsonFlags));
}

$data = curl($url);
// curl() passes through curl_exec()'s result, which is false when the transfer fails.
// Normalise that to an empty string so the patterns below simply find nothing.
$html = is_string($data) ? $data : '';

preg_match('/<h2 id="t_.*?" class="titName SG_txta">(.*?)<\/h2>/', $html, $title); // Capture the article title.
preg_match('/<div id="sina_keyword_ad_area2" class="articalContent[\s\S]*?>([\s\S]*?)<\/div>/', $html, $content); // Capture the article body.

// Compare against '' explicitly: empty() would also reject the valid string "0".
$hasTitle = isset($title[1]) && $title[1] !== '';
$hasContent = isset($content[1]) && $content[1] !== '';
if (!$hasTitle || !$hasContent) {
    exit(json_encode(['code' => 204, 'msg' => '解析文章失败'], $jsonFlags));
}

$value = [
    'code' => 1,
    'msg' => '获取成功',
    'data' => [
        'title' => $title[1],
        'content' => $content[1]
    ]
];
echo json_encode($value, $jsonFlags);
/**
 * Perform an HTTP GET request with cURL and return the response body.
 *
 * The request sends a fixed desktop Chrome User-Agent, a Baidu referer and
 * randomly generated X-FORWARDED-FOR / CLIENT-IP headers. Redirects are
 * followed; HTTP/1.0 and IPv4 name resolution are forced.
 *
 * NOTE(review): TLS peer and host verification are switched off, so the remote
 * server's identity is not checked. Left as in the original; consider enabling.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the cURL handle cannot be
 *                      created or the transfer fails.
 */
function curl($url)
{
    $ch = curl_init(); // Create the cURL handle.
    if ($ch === false) {
        // Without a handle none of the calls below can work; report failure like curl_exec() does.
        return false;
    }

    $timeout = 30; // Timeout in seconds.
    $ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'; // Spoofed browser User-Agent.
    $ip = mt_rand(11, 191) . "." . mt_rand(0, 240) . "." . mt_rand(1, 240) . "." . mt_rand(1, 240); // Random IPv4 address for the spoofed headers.

    curl_setopt($ch, CURLOPT_URL, $url); // Request target.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // Return the body instead of printing it.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); // Limit for the connection phase only.
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); // Limit for the whole transfer, so a stalled response cannot hang the script.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // Follow redirects.
    curl_setopt($ch, CURLOPT_REFERER, 'https://www.baidu.com/'); // Spoofed referer.
    curl_setopt($ch, CURLOPT_HTTPHEADER, ['X-FORWARDED-FOR:' . $ip, 'CLIENT-IP:' . $ip]); // Spoofed client IP headers.
    curl_setopt($ch, CURLOPT_USERAGENT, $ua); // Spoofed User-Agent.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // HTTPS: do not verify the certificate...
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); // ...nor the host name.
    curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0); // Force HTTP/1.0.
    curl_setopt($ch, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4); // Resolve host names over IPv4 only.

    $content = curl_exec($ch);
    curl_close($ch); // Release the handle.

    return $content;
}
// Scraped page residue, not part of the script: "还没有评论呢,快来抢沙发~" ("No comments yet, be the first to comment!")