2012年8月27日 星期一

移除所有HTML tag + 移除 JavaScript

/// <summary>
/// Strips embedded JavaScript and every HTML tag from a string of markup,
/// leaving only the text that was between the tags.
/// </summary>
/// <remarks>
/// This is a regex-based cleanup, not an HTML parser: anything shaped like
/// <c>&lt;...&gt;</c> is removed, and character entities such as
/// <c>&amp;nbsp;</c> are left untouched.
/// </remarks>
/// <param name="htmlSource">The HTML markup to clean.</param>
/// <returns>The input with script elements and all tags removed.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="htmlSource"/> is null (thrown by <see cref="Regex.Replace(string, string, string)"/>).
/// </exception>
public static string RemoveHTMLTag(string htmlSource)
{
    // Remove script elements together with their contents, so the JavaScript
    // source does not survive as plain text once the tags are stripped.
    // HTML tag names are case-insensitive, hence IgnoreCase (<SCRIPT> must match);
    // \b keeps the pattern from matching longer tag names that merely start with
    // "script", and \s* tolerates whitespace before the closing '>'.
    // [\d\D] matches any character including newlines, so multi-line scripts work.
    htmlSource = Regex.Replace(
        htmlSource,
        @"<script\b[^>]*>[\d\D]*?</script\s*>",
        String.Empty,
        RegexOptions.IgnoreCase);

    // Remove every remaining tag: a '<', anything that is not '>', then '>'.
    htmlSource = Regex.Replace(htmlSource, @"<[^>]*>", String.Empty);
    return htmlSource;
}
 
/// <summary>
/// Downloads the resource at the given URL and returns it as a string.
/// </summary>
/// <param name="url">The address to download from.</param>
/// <returns>The downloaded content as returned by <see cref="WebClient.DownloadString(string)"/>.</returns>
public static string GetSourceFromUrl(string url)
{
    // WebClient is IDisposable; the using statement releases it even when
    // DownloadString throws.
    using (WebClient client = new WebClient())
    {
        // Send browser-like request headers, as a precaution against servers
        // that treat non-browser clients differently.
        client.Headers.Add("User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5");
        client.Headers.Add("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        // "identity" asks for an uncompressed response body.
        client.Headers.Add("Accept-Encoding: identity");
        client.Headers.Add("Accept-Language: zh-TW,en;q=0.8");
        client.Headers.Add("Accept-Charset: utf-8;q=0.7,*;q=0.3");
        // NOTE(review): "ContentType" is not the standard "Content-Type" header
        // name; kept unchanged to preserve the request exactly - confirm intent.
        client.Headers.Add("ContentType", "application/x-www-form-urlencoded");

        // Encoding WebClient uses when converting the downloaded bytes to a string.
        client.Encoding = Encoding.UTF8;
        return client.DownloadString(url);
    }
}


使用:

// Click handler: downloads the page at the URL typed into txtLink, strips its
// script blocks and HTML tags, and shows the remaining text in ltlResult.
protected void btnTest_Click(object sender, EventArgs e)
{
    string address = txtLink.Text;
    string pageSource = GetSourceFromUrl(address);

    // NOTE(review): the stripped text is written to the page without HTML
    // encoding - confirm that is acceptable for untrusted pages.
    string plainText = RemoveHTMLTag(pageSource);
    ltlResult.Text = plainText;
}



原始文章出自於此

沒有留言:

張貼留言