Spaces:
Sleeping
Sleeping
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from src.config import config
class TextProcessor:
    """Fetch web pages, reduce them to plain text, and split the text into word chunks.

    A chunk is a window of ``chunk_size`` words; consecutive windows share
    ``chunk_overlap`` words.
    """

    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        """Store the chunking parameters, falling back to ``config`` defaults.

        Args:
            chunk_size: Words per chunk. ``None`` (or 0) selects
                ``config.CHUNK_SIZE``.
            chunk_overlap: Words shared by consecutive chunks. ``None`` selects
                ``config.CHUNK_OVERLAP``; an explicit 0 disables overlap.
        """
        # A zero chunk size is unusable, so any falsy value still falls back.
        self.chunk_size = chunk_size or config.CHUNK_SIZE
        # Zero overlap is a legitimate request; only None means "use the default".
        self.chunk_overlap = (
            config.CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
        )

    def fetch_webpage(self, url: str, timeout: int = 30) -> str:
        """Download ``url`` with a browser-like User-Agent and return the body text.

        Args:
            url: Address to request.
            timeout: Value passed to ``requests.get`` as its ``timeout``.

        Returns:
            The response body as decoded by ``requests`` (``response.text``).

        Raises:
            requests.HTTPError: If ``raise_for_status`` rejects the response.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text

    def clean_html(self, html: str) -> str:
        """Strip markup and page chrome from ``html`` and normalise whitespace.

        ``script``, ``style``, ``nav``, ``footer``, ``header`` and ``aside``
        elements are removed before the text is extracted.

        Returns:
            The remaining text with every whitespace run collapsed to one
            space and no leading or trailing whitespace.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # split() with no separator drops all whitespace runs, so re-joining
        # with single spaces both collapses and trims in one pass.
        return ' '.join(soup.get_text().split())

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into overlapping word windows.

        Args:
            text: Text to split on whitespace.

        Returns:
            One dict per chunk with keys ``id`` (0-based index), ``text``
            (the words joined by single spaces), ``start_word`` (inclusive
            word index) and ``end_word`` (exclusive word index). Empty or
            whitespace-only input yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``; the window could never
                advance in that case.
        """
        if not text:
            return []
        words = text.split()
        if not words:
            return []

        step = self.chunk_size - self.chunk_overlap
        if self.chunk_size < 1 or step < 1:
            raise ValueError(
                "chunk_overlap must be smaller than a positive chunk_size "
                f"(chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap})"
            )

        total = len(words)
        chunks = []
        for chunk_id, start in enumerate(range(0, total, step)):
            end = min(start + self.chunk_size, total)
            chunks.append({
                "id": chunk_id,
                "text": ' '.join(words[start:end]),
                "start_word": start,
                "end_word": end,
            })
        return chunks

    def process_url(self, url: str) -> Dict[str, Any]:
        """Fetch ``url``, clean its HTML, and chunk the resulting text.

        Returns:
            A dict with ``url``, ``full_text`` (the cleaned text), ``chunks``
            (the output of ``chunk_text``), ``chunk_count`` and ``word_count``.
        """
        html = self.fetch_webpage(url)
        clean_text = self.clean_html(html)
        chunks = self.chunk_text(clean_text)
        return {
            "url": url,
            "full_text": clean_text,
            "chunks": chunks,
            "chunk_count": len(chunks),
            "word_count": len(clean_text.split()),
        }
# Shared module-level instance; built with no arguments, so both chunking
# parameters come from config.
text_processor = TextProcessor()