// HTML parsing helper: each HtmlElement is one node of an HTML document and may contain child nodes.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using ZmjTool;
namespace ZmjConvert
{
/// <summary>
/// Used for HTML parsing: an element is one node of an HTML document and may contain child nodes.
/// </summary>
public class HtmlElement : CHtmlElement<HtmlElement>
{
/// <summary>
/// Gets the values of attribute <paramref name="att"/> from the elements returned by the
/// single-argument indexer for <paramref name="tag"/> - for example the <c>href</c> of every <c>a</c>.
/// </summary>
/// <param name="tag">Tag name forwarded to the single-argument indexer.</param>
/// <param name="att">Attribute name to read; elements that do not carry it are skipped.</param>
/// <returns>A lazily evaluated sequence of the matching attribute values.</returns>
public IEnumerable<string> this[string tag, string att]
{
    get
    {
        return from node in this[tag]
               where node.Attributes.ContainsKey(att)
               select node.Attributes[att];
    }
}
/// <summary>
/// Gets the text of this element without tag information.
/// </summary>
/// <returns>
/// This element's own <c>InnerText</c> when it has no children; otherwise the
/// concatenation of the <c>InnerText</c> of each direct child, in order.
/// </returns>
public string GetInnerText()
{
    if (Childrens.Count >= 1)
    {
        var text = new StringBuilder();
        foreach (var child in Childrens)
        {
            text.Append(child.InnerText);
        }
        return text.ToString();
    }

    // Leaf element: its own text is the whole answer.
    return InnerText;
}
/// <summary>
/// Flattens a set of elements and all of their descendants into a single list.
/// </summary>
/// <param name="ele">The elements to start from.</param>
/// <returns>
/// Every element in pre-order (an element, then its descendants). Elements whose tag name
/// is <c>style</c> or <c>!--</c> are left out together with everything beneath them.
/// </returns>
public static IEnumerable<HtmlElement> GetAllElement(IEnumerable<HtmlElement> ele)
{
    // Style blocks and comments are neither reported nor descended into.
    bool IsIncluded(HtmlElement node) => !(node.TagName == "style" || node.TagName == "!--");

    var collected = new List<HtmlElement>();
    foreach (var node in ele.Where(IsIncluded))
    {
        collected.Add(node);
        var descendants = GetAllElement(node.Children.ToArray());
        collected.AddRange(descendants);
    }
    return collected;
}
/// <summary>
/// Collects the targets of all links, i.e. the <c>href</c> of every <c>a</c> element.
/// </summary>
/// <param name="eles">The elements to scan (not descended into; pass a flattened list).</param>
/// <param name="srcurl">Base address that relative <c>href</c> values are resolved against.</param>
/// <returns>
/// The resolved link addresses, de-duplicated with <c>HttpTool.UriEquasComp</c>.
/// An <c>href</c> that cannot be turned into a URI is skipped.
/// </returns>
public static IEnumerable<Uri> GetAllLink(IEnumerable<HtmlElement> eles, Uri srcurl)
{
    var links = new List<Uri>();
    foreach (var anchor in eles.Where(x => x.TagName == "a" && x.Attributes.ContainsKey("href")))
    {
        try
        {
            links.Add(new Uri(srcurl, HttpUtility.HtmlDecode(anchor.Attributes["href"])));
        }
        catch (UriFormatException)
        {
            // One malformed href must not abort extraction of the remaining links.
        }
    }
    return links.Distinct(new HttpTool.UriEquasComp());
}
/// <summary>
/// Collects the addresses of all images referenced by <c>img</c> elements.
/// </summary>
/// <param name="ele">The elements to scan (not descended into; pass a flattened list).</param>
/// <param name="srcurl">Base address that relative image addresses are resolved against.</param>
/// <returns>
/// A map from resolved image address to a caption: the element's <c>alt</c> attribute,
/// else its <c>title</c> attribute, else the empty string. Values that cannot be turned
/// into a URI are skipped.
/// </returns>
public static Dictionary<Uri, string> GetAllImageSrc(IEnumerable<HtmlElement> ele, Uri srcurl)
{
    var found = new Dictionary<Uri, string>();
    foreach (var img in ele.Where(x => x.TagName == "img"))
    {
        var alt = img.Attributes.ContainsKey("alt") ? img.Attributes["alt"]
            : img.Attributes.ContainsKey("title") ? img.Attributes["title"]
            : string.Empty;

        // Every attribute whose name starts with "data-" or "src" is treated as a candidate address.
        // Attribute names are identifiers, so they are compared ordinally.
        var candidates = img.Attributes
            .Where(x => x.Key.StartsWith("data-", StringComparison.Ordinal) || x.Key.StartsWith("src", StringComparison.Ordinal))
            .Select(x => x.Value);

        foreach (var raw in candidates)
        {
            Uri resolved;
            try
            {
                resolved = new Uri(srcurl, HttpUtility.HtmlDecode(raw));
            }
            catch (UriFormatException)
            {
                // Not a usable address; keep going with the other candidates.
                continue;
            }

            // The first caption seen for an address wins.
            if (!found.ContainsKey(resolved)) found.Add(resolved, alt);
        }
    }
    return found.Distinct(new HttpTool.DicStringUrlEquasComp()).ToDictionary(x => x.Key, y => y.Value);
}
/// <summary>
/// Collects the addresses of all media referenced by the given elements.
/// </summary>
/// <param name="ele">The elements to scan (not descended into; pass a flattened list).</param>
/// <param name="srcurl">Base address that relative media addresses are resolved against.</param>
/// <returns>
/// A map from media address to a flag: <c>false</c> for addresses taken from attributes of
/// <c>video</c>/<c>source</c> elements, <c>true</c> for m3u8 addresses found in <c>script</c>
/// text. When an address is found more than once, the first entry is kept.
/// </returns>
public static Dictionary<Uri, bool> GetAllMediaSrc(IEnumerable<HtmlElement> ele, Uri srcurl)
{
    // The input is walked twice (media tags, then scripts), so materialize it once.
    var elements = ele as IList<HtmlElement> ?? ele.ToList();

    var tagged = new List<Uri>();
    foreach (var media in elements.Where(x => x.TagName == "video" || x.TagName == "source"))
    {
        // Every attribute whose name starts with "data-" or "src" is treated as a candidate address.
        var candidates = media.Attributes
            .Where(x => x.Key.StartsWith("data-", StringComparison.Ordinal) || x.Key.StartsWith("src", StringComparison.Ordinal));
        foreach (var candidate in candidates)
        {
            try
            {
                tagged.Add(new Uri(srcurl, HttpUtility.HtmlDecode(candidate.Value)));
            }
            catch (UriFormatException)
            {
                // Not a usable address; keep going with the other candidates.
            }
        }
    }

    var vids = new Dictionary<Uri, bool>();
    foreach (var uri in tagged.Distinct(new HttpTool.UriEquasComp()))
    {
        // The dictionary uses default Uri equality, which may be coarser than the custom
        // comparer above, so guard against duplicate keys instead of letting Add throw.
        if (!vids.ContainsKey(uri)) vids.Add(uri, false);
    }

    foreach (var script in elements.Where(x => x.TagName == "script" && !string.IsNullOrWhiteSpace(x.InnerText)))
    {
        foreach (var uri in GetM3U8FromScript(script.InnerText, srcurl))
        {
            // The same playlist address commonly appears several times in page scripts.
            if (!vids.ContainsKey(uri)) vids.Add(uri, true);
        }
    }
    return vids;
}
/// <summary>
/// Tries to read the video dimensions and total duration from an m3u8 document. If the
/// document has a first "other line", that line is treated as a link to a nested m3u8
/// document, which is fetched and whose values are added on top.
/// </summary>
/// <param name="src">Address of the m3u8 document; also the base for the nested link.</param>
/// <param name="content">Text of the m3u8 document.</param>
/// <param name="w">Receives the width (0 on failure).</param>
/// <param name="h">Receives the height (0 on failure).</param>
/// <param name="time">Receives the summed duration (<see cref="TimeSpan.Zero"/> on failure).</param>
/// <param name="getcontent">Callback that downloads the text of the nested document.</param>
/// <returns>
/// <c>true</c> when the top-level document was processed; <c>false</c> when it threw.
/// A failure while handling the nested document is ignored and does not affect the result.
/// </returns>
public static bool GetM3U8VideoSize(Uri src, string content, out int w, out int h, out TimeSpan time, Func<Uri, string> getcontent)
{
    int width;
    int height;
    TimeSpan duration;
    try
    {
        var playlist = new M3U8Document(content, src);
        width = playlist.Width;
        height = playlist.Heigth;
        duration = TimeSpan.FromSeconds(playlist.UriList.Sum(x => x.Info));

        if (playlist.OtherLine.FirstOrDefault() is string nestedLink)
        {
            // Best effort: whatever was accumulated before a failure is kept.
            try
            {
                var nestedUri = new Uri(src, nestedLink);
                var nestedContent = getcontent.Invoke(nestedUri);
                var nested = new M3U8Document(nestedContent, nestedUri);
                width += nested.Width;
                height += nested.Heigth;
                duration += TimeSpan.FromSeconds(nested.UriList.Sum(x => x.Info));
            }
            catch (Exception) { }
        }
    }
    catch (Exception)
    {
        w = 0;
        h = 0;
        time = TimeSpan.Zero;
        return false;
    }

    w = width;
    h = height;
    time = duration;
    return true;
}
/// <summary>
/// Finds m3u8 video addresses inside script text.
/// </summary>
/// <param name="script">The script text to search.</param>
/// <param name="src">Base address that the found addresses are resolved against.</param>
/// <returns>
/// The resolved addresses in order of appearance (duplicates are not removed).
/// A candidate that cannot be turned into a URI is skipped.
/// </returns>
public static IEnumerable<Uri> GetM3U8FromScript(string script, Uri src)
{
    var found = new List<Uri>();
    var cursor = 0;
    for (var candidate = FindM3U8At(script, ref cursor);
         candidate != null;
         candidate = FindM3U8At(script, ref cursor))
    {
        try
        {
            found.Add(new Uri(src, candidate));
        }
        catch (Exception)
        {
            // Unusable candidate: move on to the next one.
        }
    }
    return found;
}
/// <summary>
/// Searches <paramref name="src"/> for the next m3u8 video link, starting at <paramref name="idx"/>.
/// </summary>
/// <param name="src">The text to search.</param>
/// <param name="idx">
/// Position to start searching from. After a hit it is moved past the returned link so the
/// next call continues from there; it is left untouched when no ".m3u8" marker remains.
/// </param>
/// <returns>
/// The link text from the nearest preceding "http" up to the first terminator character
/// after the ".m3u8" marker (or the end of the text), with all backslashes removed;
/// <c>null</c> when no further link is found.
/// </returns>
public static string FindM3U8At(string src, ref int idx)
{
    const string marker = ".m3u8";
    var terminators = " '\"(){}[]|!~^".ToCharArray();

    while (idx < src.Length)
    {
        var st = src.IndexOf(marker, idx, StringComparison.OrdinalIgnoreCase);
        if (st < 0) return null; // no marker left

        var sst = src.LastIndexOf("http", st, StringComparison.OrdinalIgnoreCase);
        if (sst < 0)
        {
            // No "http" before the marker, so this is probably not a link. Resume the search
            // AFTER the marker; resuming at the marker itself would find it again forever.
            idx = st + marker.Length;
            continue;
        }

        // Look for a terminator after the marker because links may look like "x.m3u8?...".
        var ed = src.IndexOfAny(terminators, st);
        if (ed < 0) ed = src.Length; // the link runs to the end of the text

        idx = Math.Min(ed + 1, src.Length);

        // Some sites escape the slashes ("http:\/\/..."), so strip backslashes.
        return src.Substring(sst, ed - sst).Replace("\\", string.Empty);
    }
    return null;
}
/// <summary>
/// Tries to parse an HTML string into a set of elements.
/// </summary>
/// <param name="htmlstr">The HTML text to parse.</param>
/// <param name="eles">Receives the parsed elements, or <c>null</c> when parsing fails.</param>
/// <param name="message">Receives a description of the failure, or the empty string on success.</param>
/// <returns><c>true</c> when parsing succeeded; otherwise <c>false</c>.</returns>
public static bool TryParse(string htmlstr, out IEnumerable<HtmlElement> eles, out string message)
{
    try
    {
        // Collapse each run of line breaks / tabs / form feeds into ONE space rather than
        // deleting it: such a run is often the only separator between a tag name and its
        // attributes, or between two words.
        htmlstr = Regex.Replace(htmlstr, @"[\f\n\r\t\v]+", " ");
        eles = Parse(htmlstr);
        message = string.Empty;
        return true;
    }
    catch (Exception e)
    {
        eles = null;
        message = "html文档解析异常:" + e.Message;
        return false;
    }
}
/// <summary>
/// Concatenates the text of the given elements.
/// </summary>
/// <param name="element">The elements to read (not descended into; pass a flattened list).</param>
/// <returns>
/// The trimmed, HTML-decoded <c>InnerText</c> of every element, joined without a separator.
/// Elements whose tag name is <c>script</c>, <c>style</c>, <c>!--</c> or <c>title</c> are skipped.
/// </returns>
public static string GetAllInnerText(IEnumerable<HtmlElement> element)
{
    var pieces = element
        .Where(x => !(x.TagName == "script" || x.TagName == "style" || x.TagName == "!--" || x.TagName == "title"))
        .Select(x => HttpUtility.HtmlDecode(x.InnerText.Trim()));
    return string.Concat(pieces);
}
/// <summary>
/// Looks for an encoding declaration among the given elements.
/// </summary>
/// <param name="element">The elements to scan (not descended into; pass a flattened list).</param>
/// <returns>
/// The declared encoding, or <c>null</c> when none is declared or the declared name is not
/// recognised. A <c>charset=</c> value inside the <c>content</c> of an
/// <c>http-equiv="content-type"</c> element takes precedence over a plain <c>charset</c> attribute.
/// </returns>
public new static Encoding GetEncoding(IEnumerable<HtmlElement> element)
{
    const string marker = "charset=";
    string ce = null;

    var e = element.FirstOrDefault(x => x.Attributes.ContainsKey("charset"));
    if (e != null) ce = e.Attributes["charset"];

    e = element.FirstOrDefault(x => x.Attributes.ContainsKey("http-equiv")
        && string.Equals(x.Attributes["http-equiv"], "content-type", StringComparison.OrdinalIgnoreCase)
        && x.Attributes.ContainsKey("content"));
    if (e != null)
    {
        var content = e.Attributes["content"] ?? string.Empty;
        var st = content.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
        if (st >= 0)
        {
            // Only look for the end of the value once the marker is known to exist;
            // searching from "st + 8" with st == -1 can start beyond the end of a short string.
            var valueStart = st + marker.Length;
            var ed = content.IndexOfAny(" ;\"'".ToCharArray(), valueStart);
            var name = ed < 0 ? content.Substring(valueStart) : content.Substring(valueStart, ed - valueStart);

            // Invariant lowering: culture-sensitive lowering can corrupt names such as "ISO-8859-1".
            ce = name.ToLowerInvariant();
        }
    }

    if (string.IsNullOrWhiteSpace(ce)) return null;
    try
    {
        return Encoding.GetEncoding(ce);
    }
    catch (Exception ex) when (ex is ArgumentException || ex is NotSupportedException)
    {
        // Unknown or unsupported encoding name.
        return null;
    }
}
/// <summary>
/// Finds the first element with the given tag whose <c>name</c> attribute equals
/// <paramref name="filtername"/> and returns one of its other attributes.
/// </summary>
/// <param name="element">The elements to scan (not descended into; pass a flattened list).</param>
/// <param name="tag">Tag name the element must have.</param>
/// <param name="filtername">
/// Value the lower-cased <c>name</c> attribute must equal; pass it in lower case.
/// </param>
/// <param name="attrname">Name of the attribute to return; the element must carry it.</param>
/// <returns>The attribute value, or the empty string when no element matches.</returns>
public static string GetElementAttr(IEnumerable<HtmlElement> element, string tag, string filtername, string attrname)
{
    bool Matches(HtmlElement x)
    {
        if (x.TagName != tag) return false;
        if (!x.Attributes.ContainsKey("name")) return false;
        if (x.Attributes["name"].ToLower() != filtername) return false;
        return x.Attributes.ContainsKey(attrname);
    }

    var match = element.FirstOrDefault(Matches);
    if (match == null)
    {
        return string.Empty;
    }
    return match.Attributes[attrname] ?? string.Empty;
}
/// <summary>
/// Finds the address of the site icon.
/// </summary>
/// <param name="element">The elements to scan (not descended into; pass a flattened list).</param>
/// <param name="srcurl">Base address that the icon's <c>href</c> is resolved against.</param>
/// <returns>
/// The resolved <c>href</c> of the first <c>link</c> element whose <c>rel</c> contains
/// "icon" or "shortcut". When there is no such element, the empty string is resolved
/// against <paramref name="srcurl"/> instead.
/// </returns>
public static Uri GetWebIcon(IEnumerable<HtmlElement> element, Uri srcurl)
{
    bool IsIconLink(HtmlElement x)
    {
        if (x.TagName != "link") return false;
        if (!x.Attributes.ContainsKey("rel")) return false;
        if (!x.Attributes.ContainsKey("href")) return false;
        var rel = x.Attributes["rel"].ToLower();
        return rel.Contains("icon") || rel.Contains("shortcut");
    }

    var iconLink = element.FirstOrDefault(IsIconLink);
    var href = (iconLink != null ? iconLink.Attributes["href"] : null) ?? string.Empty;
    return new Uri(srcurl, HttpUtility.HtmlDecode(href));
}
/// <summary>
/// Gets the title of the document.
/// </summary>
/// <param name="element">The elements to scan (not descended into; pass a flattened list).</param>
/// <returns>
/// The trimmed, HTML-decoded text of the first <c>title</c> element, or the empty string
/// when there is none.
/// </returns>
public static string GetWebTitle(IEnumerable<HtmlElement> element)
{
    var titleNode = element.FirstOrDefault(x => x.TagName == "title");
    if (titleNode == null)
    {
        return string.Empty;
    }

    var trimmed = titleNode.InnerText.Trim();
    return HttpUtility.HtmlDecode(trimmed);
}
}
}