C#标签类文本序列化库 > RobotsElement


RobotsElement


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using ZmjTool;

namespace ZmjConvert
{
    /// <summary>
    /// One record of a robots.txt document: a user agent together with the
    /// Sitemap, Allow, Disallow and unclassified lines collected for it.
    /// </summary>
    public class RobotsElement
    {
        private const string UserAgentKey = "user-agent";
        private const string DisallowKey = "disallow";
        private const string AllowKey = "allow";
        private const string SitemapKey = "sitemap";

        /// <summary>
        /// Crawler name this record applies to ("*" when none was given).
        /// </summary>
        public string UserAgent { get; }
        /// <summary>
        /// Sitemap URLs listed in this record.
        /// </summary>
        public string[] SiteMapUrl { get; }
        /// <summary>
        /// Allowed URLs/paths.
        /// </summary>
        public string[] AllowUrl { get; }
        /// <summary>
        /// Disallowed URLs/paths.
        /// </summary>
        public string[] DisAllowUrl { get; }
        /// <summary>
        /// Lines that could not be attributed to any known directive.
        /// </summary>
        public string[] OtherUrl { get; }

        /// <summary>
        /// Creates a record from already-parsed values.
        /// </summary>
        /// <param name="ua">User agent; <c>null</c> is replaced by "*".</param>
        /// <param name="sm">Sitemap entries.</param>
        /// <param name="als">Allow entries.</param>
        /// <param name="dals">Disallow entries.</param>
        /// <param name="ou">Unclassified entries.</param>
        /// <exception cref="ArgumentNullException">Any of the array arguments is <c>null</c>.</exception>
        public RobotsElement(string ua, string[] sm, string[] als, string[] dals, string[] ou)
        {
            UserAgent = ua ?? "*";
            SiteMapUrl = sm ?? throw new ArgumentNullException(nameof(sm));
            AllowUrl = als ?? throw new ArgumentNullException(nameof(als));
            DisAllowUrl = dals ?? throw new ArgumentNullException(nameof(dals));
            OtherUrl = ou ?? throw new ArgumentNullException(nameof(ou));
        }

        /// <summary>
        /// Looks for the record that belongs to the given crawler.
        /// </summary>
        /// <param name="ua">User agent to look for; compared case-insensitively.</param>
        /// <param name="eles">Records to search; <c>null</c> is treated as empty.</param>
        /// <param name="el">Receives the first matching record, or <c>null</c> when there is none.</param>
        /// <returns><c>true</c> when a matching record was found.</returns>
        public static bool ContainsUserAgent(string ua, RobotsElement[] eles, ref RobotsElement el)
        {
            // User-agent product tokens are matched without regard to case.
            el = eles?.FirstOrDefault(x => x != null && string.Equals(x.UserAgent, ua, StringComparison.OrdinalIgnoreCase));
            return el != null;
        }

        /// <summary>
        /// Collects the Allow entries of all records as distinct absolute URLs.
        /// </summary>
        /// <param name="srcurl">Base URL that relative entries are resolved against.</param>
        /// <param name="eles">Records to read.</param>
        /// <returns>Distinct absolute URLs; blank or unresolvable entries are skipped.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="srcurl"/> or <paramref name="eles"/> is <c>null</c>.</exception>
        public static string[] GetAllAllowUrl(Uri srcurl, RobotsElement[] eles)
        {
            return ResolveUrls(srcurl, eles, x => x.AllowUrl);
        }

        /// <summary>
        /// Collects the Sitemap entries of all records as distinct absolute URLs.
        /// </summary>
        /// <param name="srcurl">Base URL that relative entries are resolved against.</param>
        /// <param name="eles">Records to read.</param>
        /// <returns>Distinct absolute URLs; blank or unresolvable entries are skipped.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="srcurl"/> or <paramref name="eles"/> is <c>null</c>.</exception>
        public static string[] GetSiteMapUrls(Uri srcurl, RobotsElement[] eles)
        {
            return ResolveUrls(srcurl, eles, x => x.SiteMapUrl);
        }

        /// <summary>
        /// Tries to parse robots.txt text into records without throwing.
        /// </summary>
        /// <param name="htmlstr">Text of the robots.txt document.</param>
        /// <param name="eles">Receives the parsed records, or <c>null</c> on failure.</param>
        /// <param name="message">Receives an empty string on success, otherwise a description of the failure.</param>
        /// <returns><c>true</c> when parsing succeeded.</returns>
        public static bool Parse(string htmlstr, ref RobotsElement[] eles, ref string message)
        {
            try
            {
                eles = Parse(htmlstr);
                message = string.Empty;
                return true;
            }
            catch (Exception e)
            {
                // Deliberate catch-all: this overload reports every failure through its return value.
                eles = null;
                message = "Robots文档解析异常:" + e.Message;
                return false;
            }
        }

        /// <summary>
        /// Parses robots.txt text into records.
        /// </summary>
        /// <param name="htmlstr">Text of the robots.txt document.</param>
        /// <returns>One record per <c>User-agent</c> line (plus a leading record for lines that precede the first one).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlstr"/> is <c>null</c>.</exception>
        public static RobotsElement[] Parse(string htmlstr)
        {
            if (htmlstr == null)
            {
                throw new ArgumentNullException(nameof(htmlstr));
            }

            // Normalise line endings, trim each line, then drop blank lines and '#' comment lines.
            string[] lines = htmlstr
                .Replace("\r", string.Empty)
                .Split(new[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
                .Select(x => x.Trim())
                .Where(x => x.Length > 0 && x[0] != '#')
                .ToArray();

            var elements = new List<RobotsElement>();
            int idx = 0;
            RobotsElement r;
            while ((r = GetNextElement(lines, ref idx)) != null)
            {
                elements.Add(r);
            }
            return elements.ToArray();
        }

        /// <summary>
        /// Reads one record from <paramref name="its"/>, starting at <paramref name="idx"/>.
        /// A record ends just before the second <c>User-agent</c> line encountered, or at the end of input.
        /// </summary>
        /// <remarks>
        /// A line that starts with none of the known directives is appended to the list of the most
        /// recent directive (Disallow/Allow/Sitemap); before any directive it goes to the "other" list.
        /// NOTE(review): this also routes unknown directives such as "Crawl-delay: 10" into the
        /// preceding list, and consecutive User-agent lines produce separate records of which only
        /// the last receives the rules — confirm this is the intended grouping.
        /// </remarks>
        /// <param name="its">Lines of the document (comment lines already removed).</param>
        /// <param name="idx">Index of the first line to read; on return, index of the first unread line.</param>
        /// <returns>The record read, or <c>null</c> when <paramref name="idx"/> is outside the array.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="its"/> is <c>null</c>.</exception>
        public static RobotsElement GetNextElement(string[] its, ref int idx)
        {
            if (its == null)
            {
                throw new ArgumentNullException(nameof(its));
            }
            if (idx < 0 || idx >= its.Length)
            {
                return null;
            }

            string stat = null; // directive that continuation lines are attributed to
            string ua = null;
            var sm = new List<string>();
            var al = new List<string>();
            var dal = new List<string>();
            var ol = new List<string>();

            for (; idx < its.Length; idx++)
            {
                // Trim first so indented directives are still recognised.
                string line = (its[idx] ?? string.Empty).Trim();

                if (HasKey(line, UserAgentKey))
                {
                    if (ua != null)
                    {
                        // Start of the next record: leave idx on this line for the next call.
                        break;
                    }
                    ua = GetValue(line);
                }
                else if (HasKey(line, DisallowKey))
                {
                    stat = DisallowKey;
                    dal.Add(GetValue(line));
                }
                else if (HasKey(line, AllowKey))
                {
                    stat = AllowKey;
                    al.Add(GetValue(line));
                }
                else if (HasKey(line, SitemapKey))
                {
                    stat = SitemapKey;
                    sm.Add(GetValue(line));
                }
                else
                {
                    switch (stat)
                    {
                        case DisallowKey: dal.Add(line); break;
                        case AllowKey: al.Add(line); break;
                        case SitemapKey: sm.Add(line); break;
                        default: ol.Add(line); break;
                    }
                }
            }

            return new RobotsElement(ua, NonBlank(sm), NonBlank(al), NonBlank(dal), NonBlank(ol));
        }

        /// <summary>
        /// Same as <see cref="GetNextElement(string[], ref int)"/>; kept because the method was
        /// declared under this name while being called under the singular one.
        /// </summary>
        /// <param name="its">Lines of the document.</param>
        /// <param name="idx">Index of the first line to read; on return, index of the first unread line.</param>
        /// <returns>The record read, or <c>null</c> when <paramref name="idx"/> is outside the array.</returns>
        public static RobotsElement GetNextElements(string[] its, ref int idx)
        {
            return GetNextElement(its, ref idx);
        }

        /// <summary>
        /// Returns the trimmed text after the first ':' of a line.
        /// </summary>
        /// <param name="str">A "key: value" line.</param>
        /// <returns>The value part, or an empty string when the line contains no ':'.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="str"/> is <c>null</c>.</exception>
        public static string GetValue(string str)
        {
            if (str == null)
            {
                throw new ArgumentNullException(nameof(str));
            }
            int colon = str.IndexOf(':');
            return colon < 0 ? string.Empty : str.Substring(colon + 1).Trim();
        }

        /// <summary>
        /// Same as <see cref="GetValue(string)"/>; kept because the method was declared under this
        /// name while being called under the singular one.
        /// </summary>
        /// <param name="str">A "key: value" line.</param>
        /// <returns>The value part, or an empty string when the line contains no ':'.</returns>
        public static string GetValues(string str)
        {
            return GetValue(str);
        }

        /// <summary>
        /// Culture-independent, case-insensitive test for a directive name at the start of a line.
        /// </summary>
        private static bool HasKey(string line, string key)
        {
            return line.StartsWith(key, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Copies the non-blank entries of a list into an array.
        /// </summary>
        private static string[] NonBlank(List<string> values)
        {
            return values.Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();
        }

        /// <summary>
        /// Resolves the entries chosen by <paramref name="selector"/> against <paramref name="srcurl"/>
        /// and returns the distinct absolute URLs, skipping blank and unresolvable entries.
        /// </summary>
        private static string[] ResolveUrls(Uri srcurl, RobotsElement[] eles, Func<RobotsElement, string[]> selector)
        {
            if (srcurl == null)
            {
                throw new ArgumentNullException(nameof(srcurl));
            }
            if (eles == null)
            {
                throw new ArgumentNullException(nameof(eles));
            }

            return eles
                .Where(x => x != null)
                .SelectMany(selector)
                .Where(x => !string.IsNullOrWhiteSpace(x))
                .Select(x => Uri.TryCreate(srcurl, x, out Uri abs) ? abs.AbsoluteUri : null)
                .Where(x => !string.IsNullOrWhiteSpace(x))
                .Distinct()
                .ToArray();
        }
    }
}