"""Fetch web pages, strip them down to plain text and split it into chunks."""

import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from src.config import config


class TextProcessor:
    """Download a page, reduce it to plain text and cut it into word chunks.

    Chunks are windows of ``chunk_size`` words; consecutive windows share
    ``chunk_overlap`` words.
    """

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Store the chunking parameters, defaulting to the project config.

        Args:
            chunk_size: Words per chunk. ``None`` (or ``0``, which has no
                valid meaning) falls back to ``config.CHUNK_SIZE``.
            chunk_overlap: Words shared by consecutive chunks. ``None`` falls
                back to ``config.CHUNK_OVERLAP``; an explicit ``0`` is
                honoured and means "no overlap".
        """
        self.chunk_size = chunk_size or config.CHUNK_SIZE
        # ``0`` is a legitimate overlap, so only ``None`` triggers the default
        # (a plain ``or`` would silently discard an explicit zero).
        self.chunk_overlap = (
            config.CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
        )

    def fetch_webpage(self, url: str, timeout: int = 30) -> str:
        """Return the body of ``url`` as text.

        Args:
            url: Address to GET.
            timeout: Value passed to ``requests.get`` as ``timeout``.

        Raises:
            requests.HTTPError: Propagated from ``raise_for_status()`` on an
                error status code.
        """
        # A browser-like User-Agent is sent with every request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text

    def clean_html(self, html: str) -> str:
        """Strip markup and page chrome from ``html`` and normalise whitespace.

        ``script``, ``style``, ``nav``, ``footer``, ``header`` and ``aside``
        elements are removed together with their contents. Every run of
        whitespace in the remaining text becomes a single space, and the
        result has no leading or trailing whitespace.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # One regex pass collapses newlines, tabs and repeated spaces alike,
        # so no separate per-line splitting is needed beforehand.
        return re.sub(r'\s+', ' ', soup.get_text()).strip()

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into overlapping chunks of whitespace-separated words.

        Each window starts ``chunk_size - chunk_overlap`` words after the
        previous one. Windows are produced for every start index below the
        word count, so the trailing chunk(s) may be shorter than
        ``chunk_size``.

        Args:
            text: Text to split; an empty string yields ``[]``.

        Returns:
            One dict per chunk with keys ``id`` (running index from 0),
            ``text`` (the words joined by single spaces), ``start_word``
            (inclusive word index) and ``end_word`` (exclusive word index,
            capped at the total word count).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``.
        """
        if not text:
            return []

        size = self.chunk_size
        step = size - self.chunk_overlap
        # A non-positive step would never advance through the words and the
        # loop would run forever, so reject it up front.
        if size <= 0 or step <= 0:
            raise ValueError(
                "chunk_size must be positive and chunk_overlap must be "
                f"smaller than chunk_size (got chunk_size={size}, "
                f"chunk_overlap={self.chunk_overlap})"
            )

        words = text.split()
        total = len(words)
        chunks = []
        for chunk_id, start in enumerate(range(0, total, step)):
            end = min(start + size, total)
            chunks.append({
                "id": chunk_id,
                "text": ' '.join(words[start:end]),
                "start_word": start,
                "end_word": end,
            })
        return chunks

    def process_url(self, url: str) -> Dict:
        """Fetch ``url``, clean its HTML and chunk the resulting text.

        Returns:
            A dict with keys ``url``, ``full_text`` (the cleaned text),
            ``chunks`` (output of :meth:`chunk_text`), ``chunk_count`` and
            ``word_count`` (whitespace-separated words in ``full_text``).

        Raises:
            requests.HTTPError: Propagated from :meth:`fetch_webpage`.
            ValueError: Propagated from :meth:`chunk_text`.
        """
        html = self.fetch_webpage(url)
        clean_text = self.clean_html(html)
        chunks = self.chunk_text(clean_text)
        return {
            "url": url,
            "full_text": clean_text,
            "chunks": chunks,
            "chunk_count": len(chunks),
            "word_count": len(clean_text.split()),
        }


# Shared module-level instance configured from the project defaults.
text_processor = TextProcessor()