Spaces:

spamultrapromax
/

WebRAG

Sleeping

WebRAG / src /text_processor.py

Arun21102003

Initial clean commit

97f9138 7 days ago

2.36 kB

	import re
	from typing import Any, Dict, List, Optional

	import requests
	from bs4 import BeautifulSoup

	from src.config import config

	class TextProcessor:
	    """Fetch a web page, reduce it to plain text, and split it into word chunks."""

	    # Matches any run of whitespace so it can be collapsed to a single space.
	    _WHITESPACE_RE = re.compile(r'\s+')

	    # Elements removed (with their contents) before text is extracted.
	    _NON_CONTENT_TAGS = ("script", "style", "nav", "footer", "header", "aside")

	    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
	        """Store the chunking parameters.

	        Args:
	            chunk_size: Maximum number of words per chunk. ``None`` (or any
	                other falsy value) falls back to ``config.CHUNK_SIZE``.
	            chunk_overlap: Number of words shared by consecutive chunks.
	                ``None`` falls back to ``config.CHUNK_OVERLAP``.
	        """
	        # A size of 0 can never produce a chunk, so falsy sizes keep falling
	        # back to the configured default.
	        self.chunk_size = chunk_size if chunk_size else config.CHUNK_SIZE
	        # Compare with None instead of relying on truthiness: an explicit
	        # overlap of 0 is a valid request and must not be replaced by the
	        # configured default.
	        self.chunk_overlap = config.CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

	    def fetch_webpage(self, url: str, timeout: int = 30) -> str:
	        """Download *url* with a browser-like User-Agent and return its body as text.

	        Args:
	            url: Address of the page to fetch.
	            timeout: Value passed to ``requests.get`` as ``timeout``.

	        Returns:
	            The decoded response body.

	        Raises:
	            requests.HTTPError: If the response status is 4xx or 5xx.
	        """
	        headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
	        }
	        response = requests.get(url, headers=headers, timeout=timeout)
	        response.raise_for_status()

	        # For text/* responses that declare no charset, requests assumes
	        # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
	        # from the body in that case.
	        content_type = response.headers.get('Content-Type', '')
	        if 'charset' not in content_type.lower():
	            response.encoding = response.apparent_encoding

	        return response.text

	    def clean_html(self, html: str) -> str:
	        """Strip markup and non-content elements from *html*.

	        Args:
	            html: HTML document source.

	        Returns:
	            The remaining text with every run of whitespace collapsed to a
	            single space and no leading or trailing whitespace.
	        """
	        soup = BeautifulSoup(html, 'html.parser')

	        for tag in soup(list(self._NON_CONTENT_TAGS)):
	            tag.decompose()

	        # Join text fragments with a space; without a separator the text of
	        # adjacent elements is concatenated directly ("<p>a</p><p>b</p>"
	        # would become "ab").
	        text = soup.get_text(separator=' ')

	        return self._WHITESPACE_RE.sub(' ', text).strip()

	    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
	        """Split *text* into overlapping chunks of whitespace-separated words.

	        Consecutive chunks start ``chunk_size - chunk_overlap`` words apart.
	        Chunking stops as soon as a chunk reaches the last word, so no chunk
	        is a pure subset of the one before it. A negative ``chunk_overlap``
	        is not rejected; it makes consecutive chunks skip words.

	        Args:
	            text: Text to split.

	        Returns:
	            A list of dicts with keys ``"id"`` (0-based chunk index),
	            ``"text"`` (the chunk's words joined by single spaces),
	            ``"start_word"`` and ``"end_word"`` (half-open word index range).
	            Empty when *text* is empty or contains only whitespace.

	        Raises:
	            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
	                is not smaller than ``chunk_size``.
	        """
	        if not text:
	            return []

	        size = self.chunk_size
	        step = size - self.chunk_overlap
	        if size <= 0:
	            raise ValueError(f"chunk_size must be positive, got {size}")
	        if step <= 0:
	            # A non-positive step would never advance through the words.
	            raise ValueError(
	                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
	                f"chunk_size ({size})"
	            )

	        words = text.split()
	        total = len(words)
	        chunks = []

	        for chunk_id, start in enumerate(range(0, total, step)):
	            end = min(start + size, total)
	            chunks.append({
	                "id": chunk_id,
	                "text": ' '.join(words[start:end]),
	                "start_word": start,
	                "end_word": end
	            })
	            # Once a chunk reaches the final word, any further chunk would
	            # only repeat words already contained in this one.
	            if end == total:
	                break

	        return chunks

	    def process_url(self, url: str) -> Dict[str, Any]:
	        """Fetch *url*, clean its HTML, and chunk the resulting text.

	        Args:
	            url: Address of the page to process.

	        Returns:
	            A dict with keys ``"url"``, ``"full_text"`` (the cleaned text),
	            ``"chunks"`` (the result of ``chunk_text``), ``"chunk_count"``
	            and ``"word_count"``.

	        Raises:
	            requests.HTTPError: If the page request returns a 4xx/5xx status.
	            ValueError: If the chunking parameters are invalid.
	        """
	        html = self.fetch_webpage(url)

	        clean_text = self.clean_html(html)

	        chunks = self.chunk_text(clean_text)

	        return {
	            "url": url,
	            "full_text": clean_text,
	            "chunks": chunks,
	            "chunk_count": len(chunks),
	            "word_count": len(clean_text.split())
	        }

	# Shared module-level instance. It is built without arguments, so its chunk
	# size and overlap come from config.CHUNK_SIZE and config.CHUNK_OVERLAP.
	text_processor = TextProcessor()