MCP-Utilities / Purify.py
Hyphonical's picture
✨ Update PurifyHtml function: modify model loading to use 'jinaai/jina-readerLM-1.5B' with torch support, and enhance markdown conversion process.
cac9909
raw
history blame
3.74 kB
from transformers import AutoTokenizer, AutoModelForCausalLM
from bs4 import BeautifulSoup, Tag
import datetime
import requests
import torch
import re
# Regular expressions describing HTML fragments that RemoveNoise deletes.
# RemoveNoise iterates this dict with .items(), so patterns are applied in the
# insertion order below. The keys 'EmptyTags' and 'EmptyLines' are special-cased
# by name in RemoveNoise (they get re.MULTILINE only); renaming them changes
# which regex flags they are run with.
NoisePatterns = {
# <script>...</script> and <noscript>...</noscript> elements, contents included.
'(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
# <style>...</style> elements, contents included.
'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
# <svg>...</svg> elements, contents included.
'Svg': r'<[ ]*(svg)[^>]*?>.*?<\/[ ]*\1[ ]*>',
# Standalone <meta ...> and <link ...> tags, optionally self-closed with '/'.
'Meta+Link': r'<[ ]*(meta|link)[^>]*?[\/]?[ ]*>',
# HTML comments: <!-- ... -->.
'Comment': r'<[ ]*!--.*?--[ ]*>',
# <img> tags whose double-quoted src attribute is a base64 data:image URI.
'Base64Img': r'<[ ]*img[^>]+src="data:image\/[^;]+;base64,[^"]+"[^>]*[\/]?[ ]*>',
# Doctype declarations of the form <!DOCTYPE word> (optional spaces, then letters only).
'DocType': r'<!(DOCTYPE|doctype)[ ]*[a-z]*>',
# Double-quoted data-* attributes, together with their leading spaces.
'DataAttributes': r'[ ]+data-[\w-]+="[^"]*"',
# Double-quoted class attributes, together with their leading spaces.
'Classes': r'[ ]+class="[^"]*"',
# Attributes whose double-quoted value is empty, e.g. alt="".
'EmptyAttributes': r'[ ]+[a-z-]+=""',
# Double-quoted datetime attributes, together with their leading spaces.
'DateTime': r'[ ]+datetime="[^"]*"',
# One to five opening tags (names of 1-10 lowercase letters) separated only by
# whitespace, immediately followed by one to five closing tags whose name
# matches the \1 back-reference.
'EmptyTags': r'(?:<[ ]*([a-z]{1,10})[^>]*>[ \t\r\n]*){1,5}(?:<\/[ ]*\1[ ]*>){1,5}',
# A line start followed only by spaces/tabs and a line break (blank lines).
'EmptyLines': r'^[ \t]*\r?\n',
}
def RemoveNoise(RawHtml: str) -> str:
    '''Strip every fragment matched by NoisePatterns from an HTML string.

    The patterns are applied one after another, in the order they appear in
    NoisePatterns, each replacing its matches with the empty string.

    Args:
        RawHtml (str): The HTML text to clean.
    Returns:
        str: The HTML text with all matched fragments removed.
    '''
    # These two entries are run with re.MULTILINE only; every other pattern
    # also gets DOTALL (so '.' spans newlines) and IGNORECASE.
    MultilineOnlyNames = ('EmptyLines', 'EmptyTags')
    FullFlags = re.DOTALL | re.IGNORECASE | re.MULTILINE

    Result = RawHtml
    for Name, Regex in NoisePatterns.items():
        Flags = re.MULTILINE if Name in MultilineOnlyNames else FullFlags
        Result = re.sub(Regex, '', Result, flags=Flags)
    return Result
def FetchHtmlContent(Url: str, Timeout: float = 30.0) -> str | int:
    '''Fetch HTML content from a URL.

    Args:
        Url (str): The URL to fetch HTML content from.
        Timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests waits indefinitely on an
            unresponsive host.
    Returns:
        str | int: The response body text when the server answers with
            status 200; otherwise the HTTP status code as an int.
    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, invalid URL, ...).
    '''
    # A browser-like User-Agent is sent with the request.
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    Response = requests.get(Url, headers=Headers, timeout=Timeout)
    if Response.status_code == 200:
        return Response.text
    # Callers tell success from failure by the return type (str vs int).
    return Response.status_code
# NOTE(review): confirm this repository id resolves on the Hugging Face Hub;
# the published ReaderLM checkpoint appears to be 'jinaai/reader-lm-1.5b'.
_ReaderModelId = 'jinaai/jina-readerLM-1.5B'

# Holds the loaded (Tokenizer, Model) pair so the weights are read only once
# per process instead of on every PurifyHtml call.
_ReaderModelCache: dict = {}


def _LoadReaderModel():
    '''Return the (Tokenizer, Model) pair, loading it on first use only.'''
    if 'Pair' not in _ReaderModelCache:
        Tokenizer = AutoTokenizer.from_pretrained(_ReaderModelId)
        Model = AutoModelForCausalLM.from_pretrained(_ReaderModelId, torch_dtype=torch.float32, device_map='cpu')
        _ReaderModelCache['Pair'] = (Tokenizer, Model)
    return _ReaderModelCache['Pair']


def PurifyHtml(Url: str) -> str: # type: ignore
    '''Fetch a page, strip HTML noise and convert the result to markdown.

    The page is downloaded with FetchHtmlContent, prettified with
    BeautifulSoup, cleaned with RemoveNoise, and then passed to a causal
    language model with a "convert to markdown" prompt. A short summary of
    the cleaning step is printed to stdout.

    Args:
        Url (str): The URL of the page to purify.
    Returns:
        str: The text generated by the model, stripped of surrounding
            whitespace. When the fetch fails, a message is printed and the
            function returns None.
    '''
    Start = datetime.datetime.now()
    RawHtml = FetchHtmlContent(Url)
    if isinstance(RawHtml, str):
        RawCharCount = len(RawHtml)
        Soup = BeautifulSoup(RawHtml, 'html.parser')
        PrettifiedHtml = str(Soup.prettify())

        # Soup.title.string is None when <title> is empty or has nested
        # markup; fall back to the placeholder rather than printing 'None'.
        TitleTag = Soup.title
        Title = TitleTag.string if TitleTag is not None and TitleTag.string else 'No title found'
        MetaDesc = Soup.find('meta', attrs={'name': 'description'})
        Description = MetaDesc.get('content', 'No description found') if isinstance(MetaDesc, Tag) else 'No description found'

        CleanedHtml = RemoveNoise(PrettifiedHtml)
        CleanedCharCount = len(CleanedHtml)
        Ratio = CleanedCharCount / RawCharCount if RawCharCount > 0 else 0

        # Read the clock once so the timestamp and the elapsed time agree.
        Now = datetime.datetime.now()
        Summary = [
            '<!-- --- Purification Summary ---',
            f'URL: {Url}',
            f'Title: {Title}',
            f'Description: {Description}',
            f'Time of Fetch: {Now.strftime("%Y-%m-%d %H:%M:%S")} (Took {Now - Start})',
            f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
            f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
            '----------------------------- -->'
        ]
        print('\n'.join(Summary))

        Tokenizer, Model = _LoadReaderModel()
        Prompt = f'Convert this HTML to markdown:\n\n{CleanedHtml}'
        # The prompt is cut to at most 8192 tokens, so very long pages are
        # only partially seen by the model.
        Inputs = Tokenizer(Prompt, return_tensors='pt', truncation=True, max_length=8192)
        Outputs = Model.generate(
            Inputs.input_ids,
            attention_mask=Inputs.attention_mask,
            max_new_tokens=8192,
            do_sample=False,
        )
        # The output sequence starts with the prompt tokens. Drop them by
        # token count: slicing the decoded text by len(Prompt) is wrong when
        # the prompt was truncated or does not decode back to the same text.
        PromptTokenCount = Inputs.input_ids.shape[1]
        GeneratedIds = Outputs[0][PromptTokenCount:]
        return Tokenizer.decode(GeneratedIds, skip_special_tokens=True).strip()
    else:
        # FetchHtmlContent returned a status code instead of the page text.
        print(f'Failed to fetch HTML content. Status code: {RawHtml}')