# ChatWithDoc / webHandler.py
# Author: NeelTA — initial commit (d2fe6cc)
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any
class WebProcessor:
    """Fetch a web page, extract its readable text, and answer simple
    keyword-based queries against that text.

    All public methods return status dictionaries instead of raising, so
    callers can surface errors directly to a UI.
    """

    def __init__(self):
        # Plain text extracted from the most recently processed page.
        self.content = ""
        # URL of the most recently processed page.
        self.url = ""

    def process_url(self, url: str) -> Dict[str, Any]:
        """Fetch *url*, strip boilerplate markup, and cache its plain text.

        Returns a dict with ``status`` of ``"success"`` (plus ``title``,
        ``word_count``, ``num_chunks``, ``num_pages``) or ``"error"``
        (plus a human-readable ``message``). Never raises.
        """
        try:
            # Mimic a real browser so naive bot filters don't reject the request.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            # Fetch the page; raise_for_status turns HTTP 4xx/5xx into a
            # RequestException handled by the except clause below.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove non-content elements before extracting text.
            # NOTE(review): 'advertisement' is not a standard HTML tag and
            # will rarely match anything; kept for parity with the original.
            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
                element.decompose()

            # Collapse the extracted text to single-space-separated words.
            text_content = soup.get_text()
            lines = (line.strip() for line in text_content.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            text_content = ' '.join(chunk for chunk in chunks if chunk)

            if not text_content.strip():
                return {"status": "error", "message": "No text content could be extracted from the webpage"}

            # Page title, falling back to a placeholder when <title> is absent.
            title = soup.find('title')
            page_title = title.get_text().strip() if title else "Untitled"

            self.content = text_content.strip()
            self.url = url

            # Hoisted: the original computed text_content.split() twice.
            words = text_content.split()
            return {
                "status": "success",
                "message": "Web page processed successfully",
                "title": page_title,
                "num_pages": 1,
                # Ceiling division at 100 words per chunk. The original
                # `len(words) // 100 + 1` over-counted by one whenever the
                # word count was an exact multiple of 100.
                "num_chunks": (len(words) + 99) // 100,
                "word_count": len(words)
            }
        except requests.exceptions.RequestException as e:
            return {"status": "error", "message": f"Failed to fetch webpage: {str(e)}"}
        except Exception as e:
            return {"status": "error", "message": f"Error processing webpage: {str(e)}"}

    def query_response(self, query: str) -> Dict[str, Any]:
        """Answer *query* against the cached page text.

        Returns ``{"status": "success", "answer": ...}`` or an error dict
        when no page has been processed yet or the search fails.
        """
        if not self.content:
            return {"status": "error", "message": "No web content available"}
        try:
            # Simple keyword-based search over the cached content.
            answer = self._search_content(query, self.content)
            return {
                "status": "success",
                "answer": answer
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_content(self) -> str:
        """Return the cached plain-text content (empty string if none)."""
        return self.content

    def _search_content(self, query: str, content: str) -> str:
        """Rank sentences of *content* by how many query words they contain
        and return the top three, joined by ". ".

        Matching is case-insensitive substring containment per query word,
        so "cat" also matches "category".
        """
        query_words = query.lower().split()

        # Split content into sentences on '.', dropping very short fragments.
        sentences = []
        for sentence in content.split('.'):
            sentence = sentence.strip()
            if len(sentence) > 10:
                sentences.append(sentence)

        # Score each sentence by the number of query words it contains.
        relevant_sentences = []
        for sentence in sentences:
            sentence_lower = sentence.lower()
            score = sum(1 for word in query_words if word in sentence_lower)
            if score > 0:
                relevant_sentences.append((sentence, score))

        if not relevant_sentences:
            return "I couldn't find information related to your query on this webpage."

        # Sort by relevance (highest score first) and return the top three.
        relevant_sentences.sort(key=lambda x: x[1], reverse=True)
        top_sentences = [sent[0] for sent in relevant_sentences[:3]]
        return ". ".join(top_sentences)