C#标签类文本序列化库 > 用于html解析


用于HTML解析:一个element对应HTML中的一个节点,可包含若干个子节点。


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using ZmjTool;

namespace ZmjConvert
{
    /// <summary>
    /// HTML parsing helper: one <see cref="HtmlElement"/> represents a single node of an
    /// HTML document and may contain any number of child nodes.
    /// </summary>
    public class HtmlElement : CHtmlElement<HtmlElement>
    {
        /// <summary>Marker that identifies a candidate m3u8 URL inside script text.</summary>
        private const string M3U8Marker = ".m3u8";

        /// <summary>Characters treated as the end of a URL embedded in script text.</summary>
        private static readonly char[] UrlTerminators = " '\"(){}[]|!~^".ToCharArray();

        /// <summary>Characters that end the charset value inside a meta <c>content</c> attribute.</summary>
        private static readonly char[] CharsetTerminators = " ;\"'".ToCharArray();

        /// <summary>
        /// Gets the value of attribute <paramref name="att"/> from every element returned by the
        /// single-argument indexer for <paramref name="tag"/> (for example the <c>href</c> of all <c>a</c>).
        /// Elements that do not carry the attribute are skipped.
        /// </summary>
        /// <param name="tag">Tag name passed to the single-argument indexer.</param>
        /// <param name="att">Attribute name to read.</param>
        /// <returns>A lazily evaluated sequence of attribute values.</returns>
        public IEnumerable<string> this[string tag, string att]
        {
            get
            {
                return this[tag]
                    .Where(x => x.Attributes.ContainsKey(att))
                    .Select(x => x.Attributes[att]);
            }
        }

        /// <summary>
        /// Gets the text of this node: its own <c>InnerText</c> when it has no children,
        /// otherwise the concatenated <c>InnerText</c> of its direct children, in order.
        /// </summary>
        /// <returns>The collected text.</returns>
        public string GetInnerText()
        {
            // NOTE(review): this member reads "Childrens" while GetAllElement reads "Children";
            // confirm both exist on the base class.
            if (Childrens.Count < 1)
            {
                return InnerText;
            }

            var text = new StringBuilder();
            foreach (var child in Childrens)
            {
                text.Append(child.InnerText);
            }
            return text.ToString();
        }

        /// <summary>
        /// Flattens an element forest into a single list, parents before their descendants.
        /// <c>style</c> and comment (<c>!--</c>) elements are skipped together with their subtrees.
        /// </summary>
        /// <param name="ele">Root elements to flatten.</param>
        /// <returns>Every remaining element in depth-first pre-order.</returns>
        public static IEnumerable<HtmlElement> GetAllElement(IEnumerable<HtmlElement> ele)
        {
            var all = new List<HtmlElement>();
            CollectElements(ele, all);
            return all;
        }

        /// <summary>Appends <paramref name="nodes"/> and their descendants to <paramref name="sink"/> in pre-order.</summary>
        private static void CollectElements(IEnumerable<HtmlElement> nodes, List<HtmlElement> sink)
        {
            foreach (var node in nodes.Where(x => x.TagName != "style" && x.TagName != "!--"))
            {
                sink.Add(node);
                CollectElements(node.Children.ToArray(), sink);
            }
        }

        /// <summary>
        /// HTML-decodes <paramref name="rawValue"/> and resolves it against <paramref name="baseUri"/>.
        /// </summary>
        /// <returns>The resolved URI, or <c>null</c> when the value cannot be turned into a URI.</returns>
        private static Uri ResolveUri(Uri baseUri, string rawValue)
        {
            Uri resolved;
            return Uri.TryCreate(baseUri, HttpUtility.HtmlDecode(rawValue), out resolved) ? resolved : null;
        }

        /// <summary>Tells whether an attribute name starts with <c>data-</c> or <c>src</c>.</summary>
        private static bool IsSourceAttribute(string attributeName)
        {
            return attributeName.StartsWith("data-", StringComparison.Ordinal)
                || attributeName.StartsWith("src", StringComparison.Ordinal);
        }

        /// <summary>
        /// Gets all links, i.e. the <c>href</c> of every <c>a</c> element, resolved against
        /// <paramref name="srcurl"/>. Values that cannot be parsed as a URI are skipped instead of
        /// aborting the whole enumeration.
        /// </summary>
        /// <param name="eles">Elements to scan (not traversed recursively here).</param>
        /// <param name="srcurl">Base URL used to resolve relative links.</param>
        /// <returns>A lazily evaluated, de-duplicated sequence of link URIs.</returns>
        public static IEnumerable<Uri> GetAllLink(IEnumerable<HtmlElement> eles, Uri srcurl)
        {
            var links = eles
                .Where(x => x.TagName == "a" && x.Attributes.ContainsKey("href"))
                .Select(x => ResolveUri(srcurl, x.Attributes["href"]))
                .Where(x => x != null);
            return links.Distinct(new HttpTool.UriEquasComp());
        }

        /// <summary>
        /// Gets the image URLs found on <c>img</c> elements. Every attribute whose name starts with
        /// <c>data-</c> or <c>src</c> is treated as a candidate URL and resolved against
        /// <paramref name="srcurl"/>; values that cannot be parsed as a URI are skipped.
        /// </summary>
        /// <param name="ele">Elements to scan (not traversed recursively here).</param>
        /// <param name="srcurl">Base URL used to resolve relative sources.</param>
        /// <returns>
        /// A map from image URI to a caption: the <c>alt</c> attribute if present, otherwise
        /// <c>title</c>, otherwise an empty string.
        /// </returns>
        public static Dictionary<Uri, string> GetAllImageSrc(IEnumerable<HtmlElement> ele, Uri srcurl)
        {
            var found = new Dictionary<Uri, string>();
            foreach (var img in ele.Where(x => x.TagName == "img"))
            {
                var caption = img.Attributes.ContainsKey("alt") ? img.Attributes["alt"]
                    : img.Attributes.ContainsKey("title") ? img.Attributes["title"]
                    : string.Empty;

                foreach (var raw in img.Attributes.Where(x => IsSourceAttribute(x.Key)).Select(x => x.Value))
                {
                    var uri = ResolveUri(srcurl, raw);
                    if (uri != null && !found.ContainsKey(uri))
                    {
                        found.Add(uri, caption);
                    }
                }
            }
            return found.Distinct(new HttpTool.DicStringUrlEquasComp()).ToDictionary(x => x.Key, y => y.Value);
        }

        /// <summary>
        /// Gets the media URLs: candidate URLs from <c>video</c>/<c>source</c> attributes whose name
        /// starts with <c>data-</c> or <c>src</c>, plus m3u8 URLs found in non-empty <c>script</c> text.
        /// </summary>
        /// <param name="ele">Elements to scan (not traversed recursively here).</param>
        /// <param name="srcurl">Base URL used to resolve relative sources.</param>
        /// <returns>
        /// A map from media URI to a flag: <c>false</c> for URLs taken from element attributes,
        /// <c>true</c> for URLs extracted from script text. When a URL is found both ways the
        /// first entry (the attribute one) is kept.
        /// </returns>
        public static Dictionary<Uri, bool> GetAllMediaSrc(IEnumerable<HtmlElement> ele, Uri srcurl)
        {
            // Uses TagName like the rest of this class.
            var fromAttributes = new List<Uri>();
            foreach (var node in ele.Where(x => x.TagName == "video" || x.TagName == "source"))
            {
                foreach (var attribute in node.Attributes.Where(x => IsSourceAttribute(x.Key)))
                {
                    var uri = ResolveUri(srcurl, attribute.Value);
                    if (uri != null)
                    {
                        fromAttributes.Add(uri);
                    }
                }
            }

            var media = new Dictionary<Uri, bool>();
            foreach (var uri in fromAttributes.Distinct(new HttpTool.UriEquasComp()))
            {
                // The dictionary uses default Uri equality, which may differ from the comparer above.
                if (!media.ContainsKey(uri))
                {
                    media.Add(uri, false);
                }
            }

            foreach (var script in ele.Where(x => x.TagName == "script" && !string.IsNullOrWhiteSpace(x.InnerText)))
            {
                foreach (var uri in GetM3U8FromScript(script.InnerText, srcurl))
                {
                    // The same playlist URL is often repeated inside scripts; never throw on duplicates.
                    if (!media.ContainsKey(uri))
                    {
                        media.Add(uri, true);
                    }
                }
            }
            return media;
        }

        /// <summary>
        /// Tries to read the video size and duration from an m3u8 document. If the document has an
        /// entry in <c>OtherLine</c>, the first one is treated as a link to another m3u8 document,
        /// which is fetched through <paramref name="getcontent"/> and whose values are added on top.
        /// </summary>
        /// <param name="src">URL of the document; also the base for resolving the nested link.</param>
        /// <param name="content">Text of the m3u8 document.</param>
        /// <param name="w">Width reported by the document, plus that of the nested document when it was read.</param>
        /// <param name="h">Height reported by the document, plus that of the nested document when it was read.</param>
        /// <param name="time">Sum of the <c>Info</c> values of all entries, interpreted as seconds.</param>
        /// <param name="getcontent">Callback that returns the content behind a URL.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="content"/> was parsed; <c>false</c> (with all outputs
        /// zeroed) when parsing threw. A failure on the nested document does not affect the result.
        /// </returns>
        public static bool GetM3U8VideoSize(Uri src, string content, out int w, out int h, out TimeSpan time, Func<Uri, string> getcontent)
        {
            try
            {
                var document = new M3U8Document(content, src);
                w = document.Width;
                h = document.Heigth;
                time = TimeSpan.FromSeconds(document.UriList.Sum(x => x.Info));

                if (document.OtherLine.FirstOrDefault() is string nestedLink)
                {
                    try
                    {
                        var nestedUri = new Uri(src, nestedLink);
                        var nestedContent = getcontent.Invoke(nestedUri);
                        var nestedDocument = new M3U8Document(nestedContent, nestedUri);
                        w += nestedDocument.Width;
                        h += nestedDocument.Heigth;
                        time += TimeSpan.FromSeconds(nestedDocument.UriList.Sum(x => x.Info));
                    }
                    catch (Exception)
                    {
                        // Deliberately best effort: the outer document was read successfully, so a
                        // nested document that cannot be fetched or parsed is simply ignored.
                    }
                }
                return true;
            }
            catch (Exception)
            {
                w = 0;
                h = 0;
                time = TimeSpan.Zero;
                return false;
            }
        }

        /// <summary>
        /// Finds every m3u8 URL inside a script and resolves it against <paramref name="src"/>.
        /// Matches that cannot be turned into a URI are skipped.
        /// </summary>
        /// <param name="script">Script text to scan; <c>null</c> or empty yields no results.</param>
        /// <param name="src">Base URL used to resolve the matches.</param>
        /// <returns>The URIs in the order they were found (duplicates are not removed).</returns>
        public static IEnumerable<Uri> GetM3U8FromScript(string script, Uri src)
        {
            var uris = new List<Uri>();
            var position = 0;
            string candidate;
            while ((candidate = FindM3U8At(script, ref position)) != null)
            {
                Uri uri;
                if (Uri.TryCreate(src, candidate, out uri))
                {
                    uris.Add(uri);
                }
            }
            return uris;
        }

        /// <summary>
        /// Finds the next m3u8 URL in <paramref name="src"/>, starting at <paramref name="idx"/>.
        /// A match is a <c>.m3u8</c> marker preceded somewhere by <c>http</c>; the URL runs from that
        /// <c>http</c> up to the first terminator character after the marker (so a query string such
        /// as <c>.m3u8?token=...</c> is kept), or up to the end of the marker when no terminator follows.
        /// </summary>
        /// <param name="src">Text to scan.</param>
        /// <param name="idx">
        /// Position to start from. Moved just past each returned match, and past any marker that
        /// was skipped because no <c>http</c> precedes it, so repeated calls always make progress.
        /// </param>
        /// <returns>The URL with all backslashes removed, or <c>null</c> when there is no further match.</returns>
        public static string FindM3U8At(string src, ref int idx)
        {
            if (string.IsNullOrEmpty(src) || idx < 0)
            {
                return null;
            }

            while (idx < src.Length)
            {
                var marker = src.IndexOf(M3U8Marker, idx, StringComparison.OrdinalIgnoreCase);
                if (marker < 0)
                {
                    return null;
                }
                var markerEnd = marker + M3U8Marker.Length;

                // NOTE(review): the nearest preceding "http" is assumed to start this URL; if it
                // belongs to an earlier URL the returned text spans everything in between.
                var start = src.LastIndexOf("http", marker, StringComparison.OrdinalIgnoreCase);
                if (start < 0)
                {
                    // Probably not a real link. Continue AFTER this marker; restarting at the
                    // marker itself would find the same marker again and never terminate.
                    idx = markerEnd;
                    continue;
                }

                var end = src.IndexOfAny(UrlTerminators, marker);
                if (end < 0)
                {
                    // No terminator follows: end the URL at the end of the marker.
                    end = markerEnd;
                }

                idx = end + 1;
                // Some sites escape the slashes in script text (e.g. "http:\/\/..."), so drop backslashes.
                return src.Substring(start, end - start).Replace("\\", string.Empty);
            }
            return null;
        }

        /// <summary>
        /// Tries to parse an HTML string into elements. Form feed, line feed, carriage return, tab
        /// and vertical tab characters are removed from the input before parsing.
        /// </summary>
        /// <param name="htmlstr">HTML text to parse.</param>
        /// <param name="eles">The parsed elements, or <c>null</c> when parsing failed.</param>
        /// <param name="message">Empty on success; otherwise a description of the parse error.</param>
        /// <returns><c>true</c> when parsing succeeded, <c>false</c> when any exception was thrown.</returns>
        public static bool TryParse(string htmlstr, out IEnumerable<HtmlElement> eles, out string message)
        {
            try
            {
                var flattened = Regex.Replace(htmlstr, @"[\f\n\r\t\v]", string.Empty);
                eles = Parse(flattened);
                message = string.Empty;
                return true;
            }
            catch (Exception e)
            {
                eles = null;
                message = "html文档解析异常:" + e.Message;
                return false;
            }
        }

        /// <summary>
        /// Concatenates the trimmed, HTML-decoded <c>InnerText</c> of the given elements, skipping
        /// <c>script</c>, <c>style</c>, comment (<c>!--</c>) and <c>title</c> elements.
        /// </summary>
        /// <param name="element">Elements to read (not traversed recursively here).</param>
        /// <returns>The combined text.</returns>
        public static string GetAllInnerText(IEnumerable<HtmlElement> element)
        {
            var text = new StringBuilder();
            var visible = element.Where(x => x.TagName != "script"
                && x.TagName != "style"
                && x.TagName != "!--"
                && x.TagName != "title");
            foreach (var node in visible)
            {
                text.Append(HttpUtility.HtmlDecode(node.InnerText.Trim()));
            }
            return text.ToString();
        }

        /// <summary>
        /// Looks for an encoding declaration. The <c>charset</c> attribute of the first element that
        /// has one is used, unless an element with <c>http-equiv="content-type"</c> has a
        /// <c>content</c> attribute containing <c>charset=...</c>, which then takes precedence.
        /// </summary>
        /// <param name="element">Elements to scan (not traversed recursively here).</param>
        /// <returns>The declared encoding, or <c>null</c> when none is declared or the name is unknown.</returns>
        public new static Encoding GetEncoding(IEnumerable<HtmlElement> element)
        {
            const string charsetKey = "charset=";
            string name = null;

            var charsetNode = element.FirstOrDefault(x => x.Attributes.ContainsKey("charset"));
            if (charsetNode != null)
            {
                name = charsetNode.Attributes["charset"];
            }

            var contentTypeNode = element.FirstOrDefault(x => x.Attributes.ContainsKey("http-equiv")
                && string.Equals(x.Attributes["http-equiv"], "content-type", StringComparison.OrdinalIgnoreCase)
                && x.Attributes.ContainsKey("content"));
            if (contentTypeNode != null)
            {
                var content = contentTypeNode.Attributes["content"] ?? string.Empty;
                var start = content.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
                // Only look for the end of the value once the key was found; searching from a
                // position derived from -1 can lie outside a short string and throw.
                if (start >= 0)
                {
                    start += charsetKey.Length;
                    var end = content.IndexOfAny(CharsetTerminators, start);
                    name = end < 0 ? content.Substring(start) : content.Substring(start, end - start);
                }
            }

            if (string.IsNullOrWhiteSpace(name))
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name.Trim());
            }
            catch (Exception)
            {
                // An unknown or unsupported name means "no usable declaration".
                return null;
            }
        }

        /// <summary>
        /// Finds the first element with tag <paramref name="tag"/> whose <c>name</c> attribute equals
        /// <paramref name="filtername"/> (ignoring case) and which has attribute
        /// <paramref name="attrname"/>, and returns that attribute's value.
        /// </summary>
        /// <param name="element">Elements to scan (not traversed recursively here).</param>
        /// <param name="tag">Tag name to match exactly.</param>
        /// <param name="filtername">Required value of the <c>name</c> attribute.</param>
        /// <param name="attrname">Attribute whose value is returned.</param>
        /// <returns>The attribute value, or an empty string when nothing matches.</returns>
        public static string GetElementAttr(IEnumerable<HtmlElement> element, string tag, string filtername, string attrname)
        {
            var match = element.FirstOrDefault(x => x.TagName == tag
                && x.Attributes.ContainsKey("name")
                && string.Equals(x.Attributes["name"], filtername, StringComparison.OrdinalIgnoreCase)
                && x.Attributes.ContainsKey(attrname));
            return match?.Attributes[attrname] ?? string.Empty;
        }

        /// <summary>
        /// Finds the site icon: the <c>href</c> of the first <c>link</c> element whose <c>rel</c>
        /// contains <c>icon</c> or <c>shortcut</c> (ignoring case), resolved against <paramref name="srcurl"/>.
        /// </summary>
        /// <param name="element">Elements to scan (not traversed recursively here).</param>
        /// <param name="srcurl">Base URL used to resolve the icon link.</param>
        /// <returns>
        /// The icon URI. When no matching link exists, an empty reference is resolved against
        /// <paramref name="srcurl"/>.
        /// </returns>
        public static Uri GetWebIcon(IEnumerable<HtmlElement> element, Uri srcurl)
        {
            var iconLink = element.FirstOrDefault(x => x.TagName == "link"
                && x.Attributes.ContainsKey("rel")
                && x.Attributes.ContainsKey("href")
                && (x.Attributes["rel"].IndexOf("icon", StringComparison.OrdinalIgnoreCase) >= 0
                    || x.Attributes["rel"].IndexOf("shortcut", StringComparison.OrdinalIgnoreCase) >= 0));
            var href = iconLink?.Attributes["href"] ?? string.Empty;
            return new Uri(srcurl, HttpUtility.HtmlDecode(href));
        }

        /// <summary>
        /// Gets the document title: the trimmed, HTML-decoded <c>InnerText</c> of the first
        /// <c>title</c> element.
        /// </summary>
        /// <param name="element">Elements to scan (not traversed recursively here).</param>
        /// <returns>The title, or an empty string when there is no <c>title</c> element.</returns>
        public static string GetWebTitle(IEnumerable<HtmlElement> element)
        {
            var titleNode = element.FirstOrDefault(x => x.TagName == "title");
            if (titleNode == null)
            {
                return string.Empty;
            }
            return HttpUtility.HtmlDecode(titleNode.InnerText.Trim());
        }
    }
}