Web_Scrapping_Agent / pipeline /answer_generator.py
ShivanshCodex's picture
Upload 47 files
f085180 verified
# pipeline/answer_generator.py
import html
import logging
import math
import re
from collections import defaultdict

import google.generativeai as genai
import streamlit as st
# Module-level logger named after this module; its level is set to INFO here
# regardless of the application's root logging configuration.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def _parse_relevance_score(text):
    """Extract a relevance score from the model's raw reply, clamped to 1-5.

    The first run of digits in the reply is used, so replies such as "4",
    "4/5" or "Score: 4" all yield 4 (joining every digit would turn "4/5"
    into 45).

    Raises:
        ValueError: if the reply contains no digits at all.
    """
    match = re.search(r"\d+", text)
    if match is None:
        raise ValueError(f"no numeric score in model reply: {text!r}")
    return min(5, max(1, int(match.group())))


def score_documents_with_gemini(docs, user_query, model=None):
    """
    Uses Gemini to rate how relevant each document is to the user's query (scale 1–5).

    Args:
        docs: iterable of document objects; each must expose a ``metadata``
            mapping (its optional "url" key is read) and a ``page_content``
            string (only the first 500 characters are sent to the model).
        user_query: the user's question, embedded verbatim in the prompt.
        model: optional object exposing ``generate_content(prompt)`` whose
            result has a ``text`` attribute. When omitted, a
            "gemini-2.0-pro-exp-02-05" ``GenerativeModel`` is created.

    Returns:
        A list of ``{"doc", "url", "score"}`` dicts sorted by score, highest
        first. Documents whose scoring call or reply parsing fails get the
        neutral fallback score 3.
    """
    if model is None:
        model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
    scored_docs = []
    for doc in docs:
        url = doc.metadata.get("url", "No URL")
        snippet = doc.page_content[:500]
        prompt = f"""
You are a relevance evaluator assistant.
Given the user's query and a document snippet, score how relevant this document is (1 to 5 scale):
User Query:
\"\"\"{user_query}\"\"\"
Document Snippet:
\"\"\"{snippet}\"\"\"
Return a single number (1-5):
"""
        try:
            response = model.generate_content(prompt)
            score = _parse_relevance_score(response.text)
        except Exception as e:
            # Deliberate best-effort: one failed call or unparsable reply must
            # not abort the ranking of the remaining documents.
            logger.warning("Scoring failed for URL %s: %s", url, e)
            score = 3  # Default fallback score
        scored_docs.append({
            "doc": doc,
            "url": url,
            "score": score,
        })
    # list.sort is stable, so equally scored documents keep retrieval order.
    scored_docs.sort(key=lambda x: x["score"], reverse=True)
    return scored_docs
def get_diverse_documents(scored_docs, min_required):
    """
    Pick one entry per distinct URL, returning at most ``min_required`` entries.

    For every URL only the first entry encountered in ``scored_docs`` is
    kept, and URLs are returned in first-seen order. Despite its name,
    ``min_required`` acts as an upper bound on the result length; a value
    of zero or less yields an empty list.
    """
    first_per_url = {}
    for candidate in scored_docs:
        # setdefault keeps the earliest entry seen for each URL.
        first_per_url.setdefault(candidate["url"], candidate)
    return [
        candidate
        for position, candidate in enumerate(first_per_url.values())
        if position < min_required
    ]
def _build_source_context(selected_docs):
    """Label each selected document's URL and assemble the prompt context.

    Args:
        selected_docs: entries with a "doc" object (its ``page_content`` is
            read) and the "url" string recorded when the entry was scored.

    Returns:
        A ``(context, unique_sources)`` tuple: the concatenated source text,
        and a dict mapping each URL to its "Source N" label in first-seen
        order.
    """
    unique_sources = {}
    sections = []
    for entry in selected_docs:
        # Reuse the URL stored at scoring time so the labels assigned here
        # match the lookups done later in the relevance ranking.
        url = entry["url"]
        label = unique_sources.setdefault(url, f"Source {len(unique_sources) + 1}")
        sections.append(f"[{label}] ({url}):\n{entry['doc'].page_content}\n\n")
    return "".join(sections), unique_sources


def generate_answer(user_query, retriever, scraped_results):
    """
    Final stage — Synthesizes a markdown answer using top documents and Gemini.

    Args:
        user_query: the user's question.
        retriever: object exposing ``get_relevant_documents(query)``.
        scraped_results: sequence of dicts; its length sizes the document
            selection and each item's optional "page" key is checked to flag
            results from deeper search pages.

    Returns:
        None. The answer, the relevance ranking and the source list are
        rendered through Streamlit; any failure is shown with ``st.error``
        and logged instead of being raised.
    """
    try:
        raw_docs = retriever.get_relevant_documents(user_query)
        scored_docs = score_documents_with_gemini(raw_docs, user_query)
        min_required = max(1, math.ceil(0.7 * len(scraped_results)))
        selected_docs = get_diverse_documents(scored_docs, min_required)
        if not selected_docs:
            # Without any source text the model could only invent a report.
            st.warning("No relevant documents were found, so no answer could be generated.")
            return
        context, unique_sources = _build_source_context(selected_docs)
        # Never demand more sources than were actually supplied; otherwise the
        # model is pushed towards fabricating citations.
        required_sources = min(3, len(unique_sources))
        if required_sources == 1:
            source_requirement = "1 source"
        else:
            source_requirement = f"{required_sources} different sources"
        # Generate markdown report
        prompt = f"""
You are a senior research analyst generating professional and comprehensive reports based on the provided source documents.
Your task is to synthesize a detailed, structured, and insight-rich report in Markdown format **based strictly on the content below**.
---
### Instructions:
- Use markdown headings (##, ###) for report sections.
- Use bullet points only where necessary. Prefer **paragraphs with facts**.
- If **tabular or comparative data** (e.g., pricing, features, plans, specs, performance) is mentioned in any source, format it as a **Markdown table** with headers.
- Do not skip important numerical, plan, or policy info just because it's complex — break it down in tables or bulleted blocks as needed.
- Use inline citations like [Source 1], [Source 2], etc. after each fact.
- You MUST incorporate information from **at least {source_requirement}**.
- If some sources contain overlapping content, reference each one explicitly.
- Avoid relying on just one source unless it's the only one with that information.
- If you cannot extract anything unique from a source, mention this in the "Source Coverage" section at the end.
---
### Output Format:
- Clean, structured Markdown
- Use clear section titles like `## Overview`, `## Key Pricing Details`, `## Feature Comparison`, `## Industry Use Cases`, etc.
- Include at least **one table** if the data structure allows.
- End with a **Source Coverage Summary** explaining which sources were used and how.
---
### SOURCE DOCUMENTS:
{context}
---
### USER QUERY:
\"\"\"{user_query}\"\"\"
Please begin.
"""
        model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
        response = model.generate_content(prompt)
        answer = response.text.strip()
        st.markdown("### 🧠 Answer")
        if any(r.get("page", 1) > 1 for r in scraped_results):
            st.markdown("<span style='color:orange; font-weight:bold;'>⚠️ Includes content from deeper Google search pages</span>", unsafe_allow_html=True)
        # NOTE(review): the answer is model output derived from scraped pages
        # and is rendered with raw HTML enabled; confirm this is intended.
        st.markdown(answer, unsafe_allow_html=True)
        with st.expander("📊 Document Relevance Ranking"):
            for entry in scored_docs:
                url = entry["url"]
                score = entry["score"]
                label = unique_sources.get(url, "-")
                rating = "High" if score >= 4 else "Medium" if score == 3 else "Low"
                # URLs come from scraped pages; escape them before placing
                # them inside an HTML attribute and link text.
                safe_url = html.escape(url, quote=True)
                st.markdown(f"<small>🔹 **{label}** | {rating} Relevance | Score: {score}/5 — <a href='{safe_url}' target='_blank'>{safe_url}</a></small>", unsafe_allow_html=True)
        with st.expander("🔗 Source Citations"):
            for url, label in unique_sources.items():
                safe_url = html.escape(url, quote=True)
                st.markdown(f"<small>🔹 **{label}**: <a href='{safe_url}' target='_blank'>{safe_url}</a></small>", unsafe_allow_html=True)
    except Exception as e:
        # UI boundary: surface the failure to the user and keep the traceback
        # in the log rather than crashing the Streamlit script.
        st.error(f"Failed to generate answer: {e}")
        logger.exception("Gemini synthesis failed.")
# import streamlit as st
# import google.generativeai as genai
# from collections import defaultdict
# import math
# from google.api_core.exceptions import ResourceExhausted
# def score_documents_with_gemini(docs, user_query):
# model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
# scored_docs = []
# for i, doc in enumerate(docs):
# url = doc.metadata.get("url", "No URL")
# snippet = doc.page_content[:500]
# prompt = f"""
# You are a relevance evaluator assistant.
# Given the user's query and a document snippet, score how relevant this document is (1 to 5 scale):
# User Query:
# \"\"\"{user_query}\"\"\"
# Document Snippet:
# \"\"\"{snippet}\"\"\"
# Return a single number (1-5):
# """
# try:
# response = model.generate_content(prompt)
# score = int("".join(filter(str.isdigit, response.text.strip())))
# except:
# score = 3 # fallback
# scored_docs.append({
# "doc": doc,
# "url": url,
# "score": score
# })
# scored_docs.sort(key=lambda x: x["score"], reverse=True)
# return scored_docs
# def get_diverse_documents(scored_docs, min_required):
# by_url = defaultdict(list)
# for entry in scored_docs:
# by_url[entry["url"]].append(entry)
# diverse_docs = []
# for entries in by_url.values():
# if len(diverse_docs) >= min_required:
# break
# diverse_docs.append(entries[0]) # take top scoring for each unique URL
# return diverse_docs
# def generate_answer(user_query, retriever, scraped_results):
# raw_docs = retriever.get_relevant_documents(user_query)
# scored_docs = score_documents_with_gemini(raw_docs, user_query)
# min_required = max(1, int(0.7 * len(scraped_results)))
# selected_docs = get_diverse_documents(scored_docs, min_required=math.ceil(0.7 * len(scraped_results)))
# context = ""
# unique_sources = {}
# for i, entry in enumerate(selected_docs):
# doc = entry["doc"]
# url = doc.metadata.get("url", "No URL Provided")
# if url not in unique_sources:
# unique_sources[url] = f"Source {len(unique_sources) + 1}"
# label = unique_sources[url]
# context += f"[{label}] ({url}):\n{doc.page_content}\n\n"
# model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
# try:
# response = model.generate_content(prompt)
# answer = response.text.strip()
# except ResourceExhausted as e:
# st.error("🚫 Gemini quota limit exceeded. Please upgrade your plan or try again tomorrow.\n\nDetails: You’ve hit the free-tier limit of 25 requests/day for this model.")
# return
# except Exception as e:
# st.error(f"❌ An unexpected error occurred while generating the answer:\n\n{str(e)}")
# return
# st.markdown("### 🧠 Answer")
# if any(r.get("page", 1) > 1 for r in scraped_results):
# st.markdown("<span style='color:orange; font-weight:bold;'>⚠️ Includes content from deeper Google search pages</span>", unsafe_allow_html=True)
# st.markdown(answer, unsafe_allow_html=True)
# # Relevance Table UI
# with st.expander("📊 Document Relevance Ranking"):
# for entry in scored_docs:
# url = entry["url"]
# score = entry["score"]
# label = unique_sources.get(url, "-")
# rating = "High" if score >= 4 else "Medium" if score == 3 else "Low"
# st.markdown(f"<small>🔹 **{label}** | {rating} Relevance | Score: {score}/5 — <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)
# # Final Source List
# with st.expander("🔗 Source Citations"):
# if unique_sources:
# for url, label in unique_sources.items():
# st.markdown(f"<small>🔹 **{label}**: <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)
# else:
# st.markdown("<small>No sources available.</small>", unsafe_allow_html=True)