# pipeline/answer_generator.py
import logging
import math
import re
from collections import defaultdict

import google.generativeai as genai
import streamlit as st
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def score_documents_with_gemini(docs, user_query):
    """Rate each document's relevance to *user_query* on a 1-5 scale via Gemini.

    Args:
        docs: Retrieved documents; each must expose ``page_content`` and a
            ``metadata`` mapping (the ``url`` key is read if present).
        user_query: The query the documents are scored against.

    Returns:
        list[dict]: One entry per document with keys ``doc``, ``url`` and
        ``score`` (int, 1-5), sorted by score in descending order.
    """
    model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
    scored_docs = []
    for doc in docs:
        url = doc.metadata.get("url", "No URL")
        # Truncate the snippet to keep the scoring prompt small and cheap.
        snippet = doc.page_content[:500]
        prompt = f"""
You are a relevance evaluator assistant.
Given the user's query and a document snippet, score how relevant this document is (1 to 5 scale):
User Query:
\"\"\"{user_query}\"\"\"
Document Snippet:
\"\"\"{snippet}\"\"\"
Return a single number (1-5):
"""
        try:
            response = model.generate_content(prompt)
            # Take the FIRST digit in the valid range.  The previous approach
            # concatenated *all* digits, so a reply like "4/5" scored as 45
            # and "1-5" as 15, corrupting the ranking.
            match = re.search(r"[1-5]", response.text)
            if match is None:
                raise ValueError(f"no 1-5 score in reply: {response.text!r}")
            score = int(match.group())
        except Exception as e:
            logger.warning(f"Scoring failed for URL {url}: {e}")
            score = 3  # Default fallback score
        scored_docs.append({
            "doc": doc,
            "url": url,
            "score": score
        })
    scored_docs.sort(key=lambda x: x["score"], reverse=True)
    return scored_docs
def get_diverse_documents(scored_docs, min_required):
    """Select up to *min_required* entries, at most one per unique URL.

    *scored_docs* is expected to be sorted by score descending, so the first
    entry encountered for a URL is that URL's best-scoring one.
    """
    best_per_url = {}
    for candidate in scored_docs:
        # setdefault keeps only the first (i.e. highest-ranked) entry per URL,
        # preserving first-seen order of the URLs themselves.
        best_per_url.setdefault(candidate["url"], candidate)
    return list(best_per_url.values())[:min_required]
def generate_answer(user_query, retriever, scraped_results):
    """Final stage: synthesize a Markdown report from top documents via Gemini.

    Retrieves documents for *user_query*, scores them with Gemini, selects a
    URL-diverse subset, and renders the generated report plus relevance and
    citation panels into the Streamlit UI.

    Args:
        user_query: The user's question.
        retriever: Retriever exposing ``get_relevant_documents`` (LangChain-style).
        scraped_results: List of scrape-result dicts; its length drives how many
            diverse sources are required, and a ``page`` value > 1 triggers the
            deeper-pages warning banner.

    Returns:
        None. All output goes to Streamlit; failures are shown via ``st.error``
        and logged with a traceback.
    """
    try:
        raw_docs = retriever.get_relevant_documents(user_query)
        scored_docs = score_documents_with_gemini(raw_docs, user_query)
        # Require coverage of roughly 70% of the scraped pages (at least one).
        min_required = max(1, math.ceil(0.7 * len(scraped_results)))
        selected_docs = get_diverse_documents(scored_docs, min_required)

        # Build the source context; each unique URL gets a stable "Source N" label.
        unique_sources = {}
        context_parts = []
        for entry in selected_docs:
            doc = entry["doc"]
            url = doc.metadata.get("url", "No URL Provided")
            if url not in unique_sources:
                unique_sources[url] = f"Source {len(unique_sources) + 1}"
            label = unique_sources[url]
            context_parts.append(f"[{label}] ({url}):\n{doc.page_content}\n\n")
        # Single join instead of repeated += (quadratic for many/large docs).
        context = "".join(context_parts)

        # Generate markdown report
        prompt = f"""
You are a senior research analyst generating professional and comprehensive reports based on the provided source documents.
Your task is to synthesize a detailed, structured, and insight-rich report in Markdown format **based strictly on the content below**.
---
### Instructions:
- Use markdown headings (##, ###) for report sections.
- Use bullet points only where necessary. Prefer **paragraphs with facts**.
- If **tabular or comparative data** (e.g., pricing, features, plans, specs, performance) is mentioned in any source, format it as a **Markdown table** with headers.
- Do not skip important numerical, plan, or policy info just because it's complex β break it down in tables or bulleted blocks as needed.
- Use inline citations like [Source 1], [Source 2], etc. after each fact.
- You MUST incorporate information from **at least 3 different sources**.
- If some sources contain overlapping content, reference each one explicitly.
- Avoid relying on just one source unless it's the only one with that information.
- If you cannot extract anything unique from a source, mention this in the "Source Coverage" section at the end.
---
### Output Format:
- Clean, structured Markdown
- Use clear section titles like `## Overview`, `## Key Pricing Details`, `## Feature Comparison`, `## Industry Use Cases`, etc.
- Include at least **one table** if the data structure allows.
- End with a **Source Coverage Summary** explaining which sources were used and how.
---
### SOURCE DOCUMENTS:
{context}
---
### USER QUERY:
\"\"\"{user_query}\"\"\"
Please begin.
"""
        model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
        response = model.generate_content(prompt)
        answer = response.text.strip()

        # NOTE(review): the header/banner strings below contain mojibake
        # (garbled emoji such as "π§") preserved byte-for-byte from the
        # original source — confirm the intended characters before changing.
        st.markdown("### π§ Answer")
        if any(r.get("page", 1) > 1 for r in scraped_results):
            st.markdown("<span style='color:orange; font-weight:bold;'>β οΈ Includes content from deeper Google search pages</span>", unsafe_allow_html=True)
        st.markdown(answer, unsafe_allow_html=True)

        # Relevance ranking panel: every scored doc, tagged High/Medium/Low.
        with st.expander("π Document Relevance Ranking"):
            for entry in scored_docs:
                url = entry["url"]
                score = entry["score"]
                # "-" when the doc was scored but not selected into the context.
                label = unique_sources.get(url, "-")
                rating = "High" if score >= 4 else "Medium" if score == 3 else "Low"
                st.markdown(f"<small>πΉ **{label}** | {rating} Relevance | Score: {score}/5 β <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)

        # Citation panel: only the sources actually used in the context.
        with st.expander("π Source Citations"):
            for url, label in unique_sources.items():
                st.markdown(f"<small>πΉ **{label}**: <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)
    except Exception as e:
        st.error(f"Failed to generate answer: {e}")
        logger.exception("Gemini synthesis failed.")
# import streamlit as st
# import google.generativeai as genai
# from collections import defaultdict
# import math
# from google.api_core.exceptions import ResourceExhausted
# def score_documents_with_gemini(docs, user_query):
# model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
# scored_docs = []
# for i, doc in enumerate(docs):
# url = doc.metadata.get("url", "No URL")
# snippet = doc.page_content[:500]
# prompt = f"""
# You are a relevance evaluator assistant.
# Given the user's query and a document snippet, score how relevant this document is (1 to 5 scale):
# User Query:
# \"\"\"{user_query}\"\"\"
# Document Snippet:
# \"\"\"{snippet}\"\"\"
# Return a single number (1-5):
# """
# try:
# response = model.generate_content(prompt)
# score = int("".join(filter(str.isdigit, response.text.strip())))
# except:
# score = 3 # fallback
# scored_docs.append({
# "doc": doc,
# "url": url,
# "score": score
# })
# scored_docs.sort(key=lambda x: x["score"], reverse=True)
# return scored_docs
# def get_diverse_documents(scored_docs, min_required):
# by_url = defaultdict(list)
# for entry in scored_docs:
# by_url[entry["url"]].append(entry)
# diverse_docs = []
# for entries in by_url.values():
# if len(diverse_docs) >= min_required:
# break
# diverse_docs.append(entries[0]) # take top scoring for each unique URL
# return diverse_docs
# def generate_answer(user_query, retriever, scraped_results):
# raw_docs = retriever.get_relevant_documents(user_query)
# scored_docs = score_documents_with_gemini(raw_docs, user_query)
# min_required = max(1, int(0.7 * len(scraped_results)))
# selected_docs = get_diverse_documents(scored_docs, min_required=math.ceil(0.7 * len(scraped_results)))
# context = ""
# unique_sources = {}
# for i, entry in enumerate(selected_docs):
# doc = entry["doc"]
# url = doc.metadata.get("url", "No URL Provided")
# if url not in unique_sources:
# unique_sources[url] = f"Source {len(unique_sources) + 1}"
# label = unique_sources[url]
# context += f"[{label}] ({url}):\n{doc.page_content}\n\n"
# model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
# try:
# response = model.generate_content(prompt)
# answer = response.text.strip()
# except ResourceExhausted as e:
# st.error("π« Gemini quota limit exceeded. Please upgrade your plan or try again tomorrow.\n\nDetails: Youβve hit the free-tier limit of 25 requests/day for this model.")
# return
# except Exception as e:
# st.error(f"β An unexpected error occurred while generating the answer:\n\n{str(e)}")
# return
# st.markdown("### π§ Answer")
# if any(r.get("page", 1) > 1 for r in scraped_results):
# st.markdown("<span style='color:orange; font-weight:bold;'>β οΈ Includes content from deeper Google search pages</span>", unsafe_allow_html=True)
# st.markdown(answer, unsafe_allow_html=True)
# # Relevance Table UI
# with st.expander("π Document Relevance Ranking"):
# for entry in scored_docs:
# url = entry["url"]
# score = entry["score"]
# label = unique_sources.get(url, "-")
# rating = "High" if score >= 4 else "Medium" if score == 3 else "Low"
# st.markdown(f"<small>πΉ **{label}** | {rating} Relevance | Score: {score}/5 β <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)
# # Final Source List
# with st.expander("π Source Citations"):
# if unique_sources:
# for url, label in unique_sources.items():
# st.markdown(f"<small>πΉ **{label}**: <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)
# else:
# st.markdown("<small>No sources available.</small>", unsafe_allow_html=True)
|