import requests
import re
from typing import Any, Dict, List, Optional

from bs4 import BeautifulSoup

from src.config import config
class TextProcessor:
    """Fetch web pages, reduce them to plain text, and split that text into
    overlapping word-based chunks (e.g. for search indexing or RAG pipelines).

    Chunk sizing defaults come from ``src.config`` when not given explicitly.
    """

    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        """Create a processor.

        Args:
            chunk_size: Words per chunk; falls back to ``config.CHUNK_SIZE``.
            chunk_overlap: Words shared between consecutive chunks; falls
                back to ``config.CHUNK_OVERLAP``.

        Raises:
            ValueError: If the overlap is not smaller than the chunk size
                (the chunking loop could otherwise never advance).
        """
        # `is None` rather than `or`: an explicit 0 must not be silently
        # replaced by the config default.
        self.chunk_size = config.CHUNK_SIZE if chunk_size is None else chunk_size
        self.chunk_overlap = config.CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
        # Guard against a non-positive step in chunk_text (infinite loop).
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )

    def fetch_webpage(self, url: str, timeout: int = 30) -> str:
        """Download *url* and return the response body as text.

        Raises:
            requests.HTTPError: On a non-2xx status.
            requests.RequestException: On connection/timeout failures.
        """
        # Browser-like User-Agent: some sites reject the default
        # `python-requests` agent outright.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # NOTE(review): `response.text` decodes with the header-declared (or
        # guessed) encoding; confirm that is acceptable for the target sites.
        return response.text

    def clean_html(self, html: str) -> str:
        """Strip markup and boilerplate from *html*, returning plain text
        with every whitespace run collapsed to a single space.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Drop non-content elements entirely (scripts, styling, page chrome).
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()
        # One whitespace-collapsing pass replaces the original
        # strip/split/join generator chain — the end result is identical.
        return re.sub(r'\s+', ' ', soup.get_text()).strip()

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Split *text* into overlapping word-window chunks.

        Returns:
            A list of dicts with keys ``id``, ``text``, ``start_word`` and
            ``end_word`` (end index exclusive). Empty input yields ``[]``.
        """
        if not text:
            return []
        words = text.split()
        # __init__ guarantees a positive step; max() also protects instances
        # whose attributes were mutated after construction.
        step = max(1, self.chunk_size - self.chunk_overlap)
        chunks: List[Dict[str, Any]] = []
        for chunk_id, start in enumerate(range(0, len(words), step)):
            end = min(start + self.chunk_size, len(words))
            chunks.append({
                "id": chunk_id,
                "text": ' '.join(words[start:end]),
                "start_word": start,
                "end_word": end,
            })
            if end == len(words):
                # Final word covered; continuing would only emit chunks
                # fully contained in this one.
                break
        return chunks

    def process_url(self, url: str) -> Dict:
        """Fetch, clean and chunk *url*.

        Returns:
            A dict with the URL, full cleaned text, chunk list, and basic
            chunk/word counts.
        """
        html = self.fetch_webpage(url)
        clean_text = self.clean_html(html)
        chunks = self.chunk_text(clean_text)
        return {
            "url": url,
            "full_text": clean_text,
            "chunks": chunks,
            "chunk_count": len(chunks),
            "word_count": len(clean_text.split())
        }
# Module-level shared instance using the chunking defaults from src.config.
# Instantiated at import time, so importing this module requires config to load.
text_processor = TextProcessor()