WebRAG / src /text_processor.py
Arun21102003
Initial clean commit
97f9138
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from src.config import config
class TextProcessor:
    """Turn a web page into overlapping, word-based text chunks.

    The pipeline is ``fetch_webpage`` -> ``clean_html`` -> ``chunk_text``;
    ``process_url`` runs all three and bundles the results in one dict.
    """

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Store the chunking parameters.

        Args:
            chunk_size: Maximum number of words per chunk. ``None`` (or 0,
                which is never a usable size) selects ``config.CHUNK_SIZE``.
            chunk_overlap: Number of words shared by consecutive chunks.
                ``None`` selects ``config.CHUNK_OVERLAP``; an explicit 0 is
                honoured and means "no overlap".
        """
        self.chunk_size = chunk_size or config.CHUNK_SIZE
        # Compare against None instead of using ``or``: 0 is a legitimate
        # overlap and must not be silently replaced by the configured default.
        self.chunk_overlap = (
            config.CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
        )

    def fetch_webpage(self, url: str, timeout: int = 30) -> str:
        """Download ``url`` with a browser-like User-Agent and return the body.

        Args:
            url: Address of the page to download.
            timeout: Value passed to ``requests.get`` as its ``timeout``.

        Returns:
            The response body decoded as text (``response.text``).

        Raises:
            requests.HTTPError: Raised by ``raise_for_status()`` when the
                server answers with an error status.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text

    def clean_html(self, html: str) -> str:
        """Strip markup and page chrome from ``html`` and return plain text.

        ``script``, ``style``, ``nav``, ``footer``, ``header`` and ``aside``
        elements are removed entirely before the text is extracted. Every
        run of whitespace (including newlines) is then collapsed to a single
        space and the result is trimmed.

        Args:
            html: Raw HTML markup.

        Returns:
            The remaining text as one whitespace-normalised line.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # One pass is enough: collapsing all whitespace runs and trimming
        # yields the same string as stripping line by line and re-joining.
        return re.sub(r'\s+', ' ', soup.get_text()).strip()

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into chunks of at most ``chunk_size`` words.

        Consecutive chunks start ``chunk_size - chunk_overlap`` words apart,
        so each chunk repeats the last ``chunk_overlap`` words of the
        previous one.

        Args:
            text: Text to split; words are separated by whitespace.

        Returns:
            A list of dicts with keys ``id`` (0-based chunk index), ``text``
            (the chunk's words joined by single spaces), ``start_word``
            (inclusive word index) and ``end_word`` (exclusive word index).
            Empty or whitespace-only input yields ``[]``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``. The window would then
                never advance and the split would not terminate.
        """
        words = text.split() if text else []
        if not words:
            return []

        step = self.chunk_size - self.chunk_overlap
        if self.chunk_size <= 0 or step <= 0:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size}), and chunk_size must be positive"
            )

        total = len(words)
        chunks = []
        for chunk_id, start in enumerate(range(0, total, step)):
            # Clamp so the final (possibly shorter) chunk reports a real index.
            end = min(start + self.chunk_size, total)
            chunks.append({
                "id": chunk_id,
                "text": ' '.join(words[start:end]),
                "start_word": start,
                "end_word": end,
            })
        return chunks

    def process_url(self, url: str) -> Dict[str, Any]:
        """Fetch ``url``, clean its HTML and chunk the resulting text.

        Args:
            url: Address of the page to process.

        Returns:
            A dict with keys ``url``, ``full_text`` (cleaned page text),
            ``chunks`` (output of ``chunk_text``), ``chunk_count`` and
            ``word_count`` (whitespace-separated words in ``full_text``).

        Raises:
            Whatever ``fetch_webpage`` or ``chunk_text`` raise.
        """
        html = self.fetch_webpage(url)
        clean_text = self.clean_html(html)
        chunks = self.chunk_text(clean_text)
        return {
            "url": url,
            "full_text": clean_text,
            "chunks": chunks,
            "chunk_count": len(chunks),
            "word_count": len(clean_text.split()),
        }
# Shared module-level instance. It is built with no arguments, so its chunk
# size and overlap are taken from config.
text_processor = TextProcessor()