// RobotsElement
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using ZmjTool;
namespace ZmjConvert
{
/// <summary>
/// One group parsed from a robots.txt document: a user agent together with the
/// sitemap, allow, disallow and unrecognised entries collected for it.
/// </summary>
public class RobotsElement
{
    /// <summary>
    /// Crawler name taken from the <c>User-agent</c> line; <c>"*"</c> when the
    /// constructor received <c>null</c>.
    /// </summary>
    public string UserAgent { get; }

    /// <summary>
    /// Sitemap URLs declared in this group (as written in the document).
    /// </summary>
    public string[] SiteMapUrl { get; }

    /// <summary>
    /// Values of the <c>Allow</c> entries in this group.
    /// </summary>
    public string[] AllowUrl { get; }

    /// <summary>
    /// Values of the <c>Disallow</c> entries in this group.
    /// </summary>
    public string[] DisAllowUrl { get; }

    /// <summary>
    /// Lines that did not belong to any recognised directive.
    /// </summary>
    public string[] OtherUrl { get; }

    /// <summary>
    /// Creates a robots.txt group.
    /// </summary>
    /// <param name="ua">User agent name; <c>null</c> is replaced by <c>"*"</c>.</param>
    /// <param name="sm">Sitemap URLs.</param>
    /// <param name="als">Allowed URLs.</param>
    /// <param name="dals">Disallowed URLs.</param>
    /// <param name="ou">Other, unrecognised entries.</param>
    /// <exception cref="ArgumentNullException">Any of the array arguments is <c>null</c>.</exception>
    public RobotsElement(string ua, string[] sm, string[] als, string[] dals, string[] ou)
    {
        UserAgent = ua ?? "*";
        SiteMapUrl = sm ?? throw new ArgumentNullException(nameof(sm));
        AllowUrl = als ?? throw new ArgumentNullException(nameof(als));
        DisAllowUrl = dals ?? throw new ArgumentNullException(nameof(dals));
        OtherUrl = ou ?? throw new ArgumentNullException(nameof(ou));
    }

    /// <summary>
    /// Looks for the group whose user agent equals <paramref name="ua"/>
    /// (exact, case-sensitive comparison).
    /// </summary>
    /// <param name="ua">User agent name to look for.</param>
    /// <param name="eles">Parsed groups to search.</param>
    /// <param name="el">Receives the first matching group, or <c>null</c> when there is none.</param>
    /// <returns><c>true</c> when a matching group was found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="eles"/> is <c>null</c>.</exception>
    public static bool ContainsUserAgent(string ua, RobotsElement[] eles, ref RobotsElement el)
    {
        if (eles == null) throw new ArgumentNullException(nameof(eles));

        el = eles.FirstOrDefault(x => x != null && x.UserAgent == ua);
        return el != null;
    }

    /// <summary>
    /// Collects the allow entries of every group as absolute URLs resolved
    /// against <paramref name="srcurl"/>, without duplicates.
    /// </summary>
    /// <param name="srcurl">Absolute base URL used to resolve relative entries.</param>
    /// <param name="eles">Parsed groups.</param>
    /// <returns>Distinct absolute URLs; entries that cannot be resolved are skipped.</returns>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static string[] GetAllAllowUrl(Uri srcurl, RobotsElement[] eles)
    {
        return ResolveUrls(srcurl, eles, x => x.AllowUrl);
    }

    /// <summary>
    /// Collects the sitemap entries of every group as absolute URLs resolved
    /// against <paramref name="srcurl"/>, without duplicates.
    /// </summary>
    /// <param name="srcurl">Absolute base URL used to resolve relative entries.</param>
    /// <param name="eles">Parsed groups.</param>
    /// <returns>Distinct absolute URLs; entries that cannot be resolved are skipped.</returns>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static string[] GetSiteMapUrls(Uri srcurl, RobotsElement[] eles)
    {
        return ResolveUrls(srcurl, eles, x => x.SiteMapUrl);
    }

    /// <summary>
    /// Shared implementation of the URL collectors: resolves every selected
    /// entry against the base URL and returns the distinct results.
    /// </summary>
    private static string[] ResolveUrls(Uri srcurl, RobotsElement[] eles, Func<RobotsElement, string[]> selector)
    {
        if (srcurl == null) throw new ArgumentNullException(nameof(srcurl));
        if (eles == null) throw new ArgumentNullException(nameof(eles));

        var resolved = new List<string>();
        foreach (var ele in eles)
        {
            if (ele == null) continue;

            foreach (var raw in selector(ele))
            {
                // robots.txt is untrusted input: one malformed entry must not
                // abort the whole collection, so skip what cannot be resolved.
                Uri absolute;
                if (Uri.TryCreate(srcurl, raw, out absolute))
                {
                    resolved.Add(absolute.AbsoluteUri);
                }
            }
        }

        return resolved.Where(x => !string.IsNullOrWhiteSpace(x)).Distinct().ToArray();
    }

    /// <summary>
    /// Tries to parse robots.txt text into groups without throwing.
    /// </summary>
    /// <param name="htmlstr">Content of the robots.txt document.</param>
    /// <param name="eles">Receives the parsed groups, or <c>null</c> on failure.</param>
    /// <param name="message">Receives an empty string on success, otherwise a description of the error.</param>
    /// <returns><c>true</c> when parsing succeeded.</returns>
    public static bool Parse(string htmlstr, ref RobotsElement[] eles, ref string message)
    {
        try
        {
            eles = Parse(htmlstr);
            message = string.Empty;
            return true;
        }
        catch (Exception e)
        {
            eles = null;
            message = "Robots文档解析异常:" + e.Message;
            return false;
        }
    }

    /// <summary>
    /// Parses robots.txt text into groups. Carriage returns are removed, empty
    /// lines and lines whose first non-blank character is <c>#</c> are dropped,
    /// and the remaining lines are split into one group per <c>User-agent</c> line.
    /// </summary>
    /// <param name="htmlstr">Content of the robots.txt document.</param>
    /// <returns>The parsed groups, in document order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlstr"/> is <c>null</c>.</exception>
    public static RobotsElement[] Parse(string htmlstr)
    {
        if (htmlstr == null) throw new ArgumentNullException(nameof(htmlstr));

        var lines = htmlstr.Replace("\r", string.Empty)
            .Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
            // Drop comment lines, including indented ones.
            .Where(x => !x.TrimStart().StartsWith("#", StringComparison.Ordinal))
            .ToArray();

        var elements = new List<RobotsElement>();
        int idx = 0;
        RobotsElement next;
        while ((next = GetNextElements(lines, ref idx)) != null)
        {
            elements.Add(next);
        }

        return elements.ToArray();
    }

    /// <summary>
    /// Reads one group starting at <paramref name="idx"/>. Reading stops at the
    /// end of the input or at a second <c>User-agent</c> line, which is left for
    /// the next call.
    /// </summary>
    /// <param name="its">Lines of the robots.txt document.</param>
    /// <param name="idx">Index of the first line to read; on return, index of the first unread line.</param>
    /// <returns>The group that was read, or <c>null</c> when <paramref name="idx"/> is outside the array.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="its"/> is <c>null</c>.</exception>
    public static RobotsElement GetNextElements(string[] its, ref int idx)
    {
        if (its == null) throw new ArgumentNullException(nameof(its));
        if (idx < 0 || idx >= its.Length) return null;

        string ua = null;
        var sm = new List<string>();
        var al = new List<string>();
        var dal = new List<string>();
        var ol = new List<string>();

        // List that receives lines which carry no directive of their own: they
        // are treated as a continuation of the most recent directive, or as
        // "other" when no directive has been seen yet.
        List<string> current = ol;

        do
        {
            var line = (its[idx] ?? string.Empty).Trim();

            // Ordinal comparison keeps directive detection independent of the
            // current culture.
            if (line.StartsWith("user-agent", StringComparison.OrdinalIgnoreCase))
            {
                if (ua != null) break; // start of the next group
                ua = GetValues(line);
            }
            else if (line.StartsWith("disallow", StringComparison.OrdinalIgnoreCase))
            {
                current = dal;
                dal.Add(GetValues(line));
            }
            else if (line.StartsWith("allow", StringComparison.OrdinalIgnoreCase))
            {
                current = al;
                al.Add(GetValues(line));
            }
            else if (line.StartsWith("sitemap", StringComparison.OrdinalIgnoreCase))
            {
                current = sm;
                sm.Add(GetValues(line));
            }
            else
            {
                current.Add(line);
            }
        } while (++idx < its.Length);

        return new RobotsElement(
            ua,
            WithoutBlanks(sm),
            WithoutBlanks(al),
            WithoutBlanks(dal),
            WithoutBlanks(ol));
    }

    /// <summary>
    /// Returns the entries that are not null, empty or whitespace.
    /// </summary>
    private static string[] WithoutBlanks(List<string> values)
    {
        return values.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
    }

    /// <summary>
    /// Returns the trimmed text that follows the first <c>:</c> in
    /// <paramref name="str"/>, or an empty string when there is no <c>:</c>.
    /// </summary>
    /// <param name="str">A directive line such as <c>Disallow: /private</c>.</param>
    /// <returns>The directive value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is <c>null</c>.</exception>
    public static string GetValues(string str)
    {
        if (str == null) throw new ArgumentNullException(nameof(str));

        int st = str.IndexOf(':');
        // Without a separator there is no value; do not return the directive
        // name itself.
        return st < 0 ? string.Empty : str.Substring(st + 1).Trim();
    }
}
}