Spaces:

Shriharsh
/

Web_Content_QA

Build error

App Files Community

Shriharsh committed on Mar 21, 2025

Commit

524057e

verified ·

1 Parent(s): 9121798

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -64

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 # Web Content Q&A Tool for Hugging Face Spaces
 # Optimized for memory constraints (2GB RAM) and 24-hour timeline
-# Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using DistilBERT with PyTorch
-# Includes keyword search fallback for low-confidence QA answers
 import gradio as gr
 from bs4 import BeautifulSoup
@@ -32,20 +31,20 @@ corpus = []  # List of paragraphs from URLs
 embeddings = None  # Precomputed embeddings for retrieval
 sources_list = []  # Source URLs for each paragraph
-# Load models at startup (memory: ~370MB total)
-# Retrieval model: all-mpnet-base-v2 (~110MB, 768-dim embeddings)
-retriever = SentenceTransformer('all-mpnet-base-v2')
 # Load PyTorch model for QA
-# Model: distilbert-base-uncased-distilled-squad (~260MB)
 try:
-    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
 except Exception as e:
     print(f"Error loading model: {str(e)}. Retrying with force_download=True...")
     # Force re-download in case of corrupted cache
-    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad", force_download=True)
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad", force_download=True)
 # Set model to evaluation mode
 model.eval()
@@ -68,38 +67,6 @@ def truncate_to_one_line(text):
         first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
     return first_sentence if first_sentence else "No answer available."
-# Keyword search function for fallback
-def keyword_search(question, corpus, sources_list):
-    stop_words = set(["what", "is", "the", "a", "an", "in", "on", "at", "for", "with", "and", "or", "but", "not", "this", "that", "these", "those", "to", "of", "it", "by", "as", "if", "when", "where", "who", "which", "how", "why"])
-    def clean_text(text):
-        return re.sub(r'[^a-zA-Z\s]', '', text).lower()
-    cleaned_question = clean_text(question)
-    keywords = [word for word in cleaned_question.split() if word not in stop_words]
-    if not keywords:
-        return "No keywords found for search.", None
-    best_paragraph = None
-    best_count = 0
-    best_source = None
-    for i, para in enumerate(corpus):
-        cleaned_para = clean_text(para)
-        words = set(cleaned_para.split())  # Use set for faster lookup
-        count = sum(1 for kw in keywords if kw in words)
-        if count > best_count:
-            best_count = count
-            best_paragraph = para
-            best_source = sources_list[i]
-    if best_paragraph is None:
-        return "No relevant paragraph found.", None
-    # Truncate the paragraph to one line
-    best_paragraph = truncate_to_one_line(best_paragraph)
-    return best_paragraph, best_source
 def ingest_urls(urls):
     """
     Ingest up to 3 URLs, scrape content, and compute embeddings.
@@ -149,11 +116,10 @@ def ingest_urls(urls):
 def answer_question(question):
     """
-    Answer a question using retrieved context and DistilBERT QA (PyTorch).
     Retrieves top 3 paragraphs to improve answer accuracy.
-    If total context exceeds 512 tokens (DistilBERT's max length), it will be truncated automatically.
-    If QA confidence is below 0.4, falls back to keyword search.
-    Ensures answers are one line (max 100 chars).
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
@@ -164,35 +130,33 @@ def answer_question(question):
     # Compute cosine similarity with stored embeddings
     cos_scores = util.cos_sim(question_embedding, embeddings)[0]
-    top_k = min(2, len(corpus))  # Get top 2 paragraphs to improve accuracy
     top_indices = np.argsort(-cos_scores)[:top_k]
-    # Retrieve context (top 2 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts)  # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
-    # Extract answer with DistilBERT (PyTorch)
     with torch.no_grad():  # Disable gradient computation for faster inference
         result = qa_model(question=question, context=context)
     answer = result['answer']
     confidence = result['score']
-    if confidence >= 0.4:
-        # Truncate QA answer to one line
-        answer = truncate_to_one_line(answer)
-        # Ensure at least one line
-        if not answer:
-            answer = "No answer available."
-        sources_str = "\n".join(set(sources))  # Unique sources
-        return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
-    else:
-        # Perform keyword search
-        kw_answer, kw_source = keyword_search(question, corpus, sources_list)
-        if kw_source:
-            return f"Answer: {kw_answer} (from keyword search, as QA confidence was {confidence:.2f})\nSource: {kw_source}"
-        else:
-            return "No relevant answer found from keyword search."
 def clear_all():
     """Clear all inputs and outputs for a fresh start."""

 # Web Content Q&A Tool for Hugging Face Spaces
 # Optimized for memory constraints (2GB RAM) and 24-hour timeline
+# Features: Ingest up to 3 URLs, ask questions, get concise one-line answers using RoBERTa with PyTorch
 import gradio as gr
 from bs4 import BeautifulSoup
 embeddings = None  # Precomputed embeddings for retrieval
 sources_list = []  # Source URLs for each paragraph
+# Load models at startup (memory: ~410MB total)
+# Retrieval model: multi-qa-mpnet-base-dot-v1 (~110MB, 768-dim embeddings)
+retriever = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
 # Load PyTorch model for QA
+# Model: deepset/roberta-base-squad2 (~355MB, quantized to ~200-250MB)
 try:
+    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
+    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
 except Exception as e:
     print(f"Error loading model: {str(e)}. Retrying with force_download=True...")
     # Force re-download in case of corrupted cache
+    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2", force_download=True)
+    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2", force_download=True)
 # Set model to evaluation mode
 model.eval()
         first_sentence = first_sentence[:100].rsplit(' ', 1)[0] + "..."
     return first_sentence if first_sentence else "No answer available."
 def ingest_urls(urls):
     """
     Ingest up to 3 URLs, scrape content, and compute embeddings.
 def answer_question(question):
     """
+    Answer a question using retrieved context and RoBERTa QA (PyTorch).
     Retrieves top 3 paragraphs to improve answer accuracy.
+    If total context exceeds 512 tokens (RoBERTa's max length), it will be truncated automatically.
+    Rejects answers with confidence below 0.3. Ensures answers are one line (max 100 chars).
     """
     global corpus, embeddings, sources_list
     if not corpus or embeddings is None:
     # Compute cosine similarity with stored embeddings
     cos_scores = util.cos_sim(question_embedding, embeddings)[0]
+    top_k = min(3, len(corpus))  # Get top 3 paragraphs as preferred
     top_indices = np.argsort(-cos_scores)[:top_k]
+    # Retrieve context (top 3 paragraphs)
     contexts = [corpus[i] for i in top_indices]
     context = " ".join(contexts)  # Concatenate with space
     sources = [sources_list[i] for i in top_indices]
+    # Extract answer with RoBERTa (PyTorch)
     with torch.no_grad():  # Disable gradient computation for faster inference
         result = qa_model(question=question, context=context)
     answer = result['answer']
     confidence = result['score']
+    # Check confidence threshold
+    if confidence < 0.3:
+        return f"No confident answer found (confidence {confidence:.2f} below 0.3)."
+    # Truncate answer to one line
+    answer = truncate_to_one_line(answer)
+    # Ensure at least one line
+    if not answer:
+        answer = "No answer available."
+    # Format response with answer, confidence, and sources
+    sources_str = "\n".join(set(sources))  # Unique sources
+    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"
 def clear_all():
     """Clear all inputs and outputs for a fresh start."""