# ChatWithDoc / webHandler.py
# Author: NeelTA — initial commit (d2fe6cc)
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any
class WebProcessor:
    """Fetch a web page, extract its readable text, and answer simple
    keyword-based queries against that text.

    All public methods return status dictionaries instead of raising, so
    callers can surface errors directly to a UI.
    """

    def __init__(self):
        # Plain text extracted from the most recently processed page.
        self.content = ""
        # URL of the most recently processed page.
        self.url = ""

    def process_url(self, url: str) -> Dict[str, Any]:
        """Fetch *url*, strip boilerplate markup, and cache its plain text.

        Returns a dict with ``status`` of ``"success"`` (plus ``title``,
        ``word_count``, ``num_chunks``, ``num_pages``) or ``"error"``
        (plus a human-readable ``message``). Never raises.
        """
        try:
            # Mimic a real browser so naive bot filters don't reject the request.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            # Fetch the page; raise_for_status turns HTTP 4xx/5xx into a
            # RequestException handled by the except clause below.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove non-content elements before extracting text.
            # NOTE(review): 'advertisement' is not a standard HTML tag and
            # will rarely match anything; kept for parity with the original.
            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
                element.decompose()

            # Collapse the extracted text to single-space-separated words.
            text_content = soup.get_text()
            lines = (line.strip() for line in text_content.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            text_content = ' '.join(chunk for chunk in chunks if chunk)

            if not text_content.strip():
                return {"status": "error", "message": "No text content could be extracted from the webpage"}

            # Page title, falling back to a placeholder when <title> is absent.
            title = soup.find('title')
            page_title = title.get_text().strip() if title else "Untitled"

            self.content = text_content.strip()
            self.url = url

            # Hoisted: the original computed text_content.split() twice.
            words = text_content.split()
            return {
                "status": "success",
                "message": "Web page processed successfully",
                "title": page_title,
                "num_pages": 1,
                # Ceiling division at 100 words per chunk. The original
                # `len(words) // 100 + 1` over-counted by one whenever the
                # word count was an exact multiple of 100.
                "num_chunks": (len(words) + 99) // 100,
                "word_count": len(words)
            }
        except requests.exceptions.RequestException as e:
            return {"status": "error", "message": f"Failed to fetch webpage: {str(e)}"}
        except Exception as e:
            return {"status": "error", "message": f"Error processing webpage: {str(e)}"}

    def query_response(self, query: str) -> Dict[str, Any]:
        """Answer *query* against the cached page text.

        Returns ``{"status": "success", "answer": ...}`` or an error dict
        when no page has been processed yet or the search fails.
        """
        if not self.content:
            return {"status": "error", "message": "No web content available"}
        try:
            # Simple keyword-based search over the cached content.
            answer = self._search_content(query, self.content)
            return {
                "status": "success",
                "answer": answer
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_content(self) -> str:
        """Return the cached plain-text content (empty string if none)."""
        return self.content

    def _search_content(self, query: str, content: str) -> str:
        """Rank sentences of *content* by how many query words they contain
        and return the top three, joined by ". ".

        Matching is case-insensitive substring containment per query word,
        so "cat" also matches "category".
        """
        query_words = query.lower().split()

        # Split content into sentences on '.', dropping very short fragments.
        sentences = []
        for sentence in content.split('.'):
            sentence = sentence.strip()
            if len(sentence) > 10:
                sentences.append(sentence)

        # Score each sentence by the number of query words it contains.
        relevant_sentences = []
        for sentence in sentences:
            sentence_lower = sentence.lower()
            score = sum(1 for word in query_words if word in sentence_lower)
            if score > 0:
                relevant_sentences.append((sentence, score))

        if not relevant_sentences:
            return "I couldn't find information related to your query on this webpage."

        # Sort by relevance (highest score first) and return the top three.
        relevant_sentences.sort(key=lambda x: x[1], reverse=True)
        top_sentences = [sent[0] for sent in relevant_sentences[:3]]
        return ". ".join(top_sentences)