File size: 10,333 Bytes
f085180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# pipeline/answer_generator.py

import streamlit as st
import google.generativeai as genai
from collections import defaultdict
import math
import logging

# Module-level logger shared by all pipeline stages in this file.
logger = logging.getLogger(__name__)
# NOTE(review): calling setLevel() at import time in a library module
# overrides any application-level logging configuration — consider leaving
# level control to the app's logging setup.
logger.setLevel(logging.INFO)

def score_documents_with_gemini(docs, user_query):
    """
    Uses Gemini to rate how relevant each document is to the user's query (scale 1-5).

    Args:
        docs: Iterable of documents exposing ``.metadata`` (dict) and
            ``.page_content`` (str) — presumably LangChain ``Document``
            objects; TODO confirm against the retriever in use.
        user_query: The user's free-text query.

    Returns:
        List of dicts ``{"doc", "url", "score"}`` sorted by score, highest
        first. Scores are clamped to 1-5; any API failure or unparsable
        reply falls back to a neutral score of 3.
    """
    model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
    scored_docs = []

    for doc in docs:
        url = doc.metadata.get("url", "No URL")
        # Only the first 500 chars are sent — enough for a relevance signal
        # while keeping per-document token cost low.
        snippet = doc.page_content[:500]

        prompt = f"""

        You are a relevance evaluator assistant.



        Given the user's query and a document snippet, score how relevant this document is (1 to 5 scale):



        User Query:

        \"\"\"{user_query}\"\"\"



        Document Snippet:

        \"\"\"{snippet}\"\"\"



        Return a single number (1-5):

        """

        try:
            response = model.generate_content(prompt)
            # Take the FIRST digit of the reply rather than concatenating
            # every digit: the previous parse turned a reply like "4/5"
            # into 45 and "Score: 10" into 10. Clamp to the valid 1-5
            # range in case the model returns 0, 9, etc.
            first_digit = next(ch for ch in response.text if ch.isdigit())
            score = min(5, max(1, int(first_digit)))
        except Exception as e:
            # Covers API errors and digit-free replies (next() raises
            # StopIteration, which Exception catches).
            logger.warning(f"Scoring failed for URL {url}: {e}")
            score = 3  # Default fallback score

        scored_docs.append({
            "doc": doc,
            "url": url,
            "score": score
        })

    scored_docs.sort(key=lambda x: x["score"], reverse=True)
    return scored_docs

def get_diverse_documents(scored_docs, min_required):
    """
    Pick up to ``min_required`` entries from ``scored_docs``, taking at most
    one entry per distinct URL.

    ``scored_docs`` is expected to be sorted best-first, so the first entry
    seen for each URL is that URL's top-scoring one. Returns fewer than
    ``min_required`` entries when there aren't enough distinct URLs.
    """
    picked = []
    seen_urls = set()
    for entry in scored_docs:
        if len(picked) >= min_required:
            break
        url = entry["url"]
        if url in seen_urls:
            continue  # already represented by a higher-ranked entry
        seen_urls.add(url)
        picked.append(entry)
    return picked

def generate_answer(user_query, retriever, scraped_results):
    """
    Final stage — synthesizes a Markdown answer using top documents and Gemini.

    Flow:
      1. Pull candidate documents from ``retriever`` for the query.
      2. Score each candidate's relevance with Gemini (1-5).
      3. Keep a URL-diverse subset covering roughly 70% of the scraped sources.
      4. Ask Gemini for a cited Markdown report and render it in Streamlit,
         plus expanders for relevance ranking and source citations.

    Args:
        user_query: Free-text query from the user.
        retriever: Object exposing ``get_relevant_documents(query)``
            (presumably a LangChain-style retriever — TODO confirm).
        scraped_results: List of dicts for the scraped pages; only ``len()``
            and the optional ``"page"`` key are used here.

    Returns:
        None — all output is rendered through Streamlit side effects.
        Any failure is reported via ``st.error`` and logged.
    """
    try:
        raw_docs = retriever.get_relevant_documents(user_query)
        scored_docs = score_documents_with_gemini(raw_docs, user_query)

        # Require coverage of ~70% of the scraped sources (at least one).
        min_required = max(1, math.ceil(0.7 * len(scraped_results)))
        selected_docs = get_diverse_documents(scored_docs, min_required)

        context = ""
        unique_sources = {}  # url -> "Source N" label, in first-seen order

        # Build the prompt context, labeling each document by its source so
        # the model can emit inline [Source N] citations.
        for i, entry in enumerate(selected_docs):
            doc = entry["doc"]
            url = doc.metadata.get("url", "No URL Provided")
            if url not in unique_sources:
                unique_sources[url] = f"Source {len(unique_sources) + 1}"
            label = unique_sources[url]
            context += f"[{label}] ({url}):\n{doc.page_content}\n\n"

        # Generate markdown report
        prompt = f"""

        You are a senior research analyst generating professional and comprehensive reports based on the provided source documents.



        Your task is to synthesize a detailed, structured, and insight-rich report in Markdown format **based strictly on the content below**.



        ---



        ### Instructions:

        - Use markdown headings (##, ###) for report sections.

        - Use bullet points only where necessary. Prefer **paragraphs with facts**.

        - If **tabular or comparative data** (e.g., pricing, features, plans, specs, performance) is mentioned in any source, format it as a **Markdown table** with headers.

        - Do not skip important numerical, plan, or policy info just because it's complex β€” break it down in tables or bulleted blocks as needed.

        - Use inline citations like [Source 1], [Source 2], etc. after each fact.

        - You MUST incorporate information from **at least 3 different sources**.

        - If some sources contain overlapping content, reference each one explicitly.

        - Avoid relying on just one source unless it's the only one with that information.

        - If you cannot extract anything unique from a source, mention this in the "Source Coverage" section at the end.



        ---



        ### Output Format:

        - Clean, structured Markdown

        - Use clear section titles like `## Overview`, `## Key Pricing Details`, `## Feature Comparison`, `## Industry Use Cases`, etc.

        - Include at least **one table** if the data structure allows.

        - End with a **Source Coverage Summary** explaining which sources were used and how.



        ---



        ### SOURCE DOCUMENTS:

        {context}



        ---



        ### USER QUERY:

        \"\"\"{user_query}\"\"\"



        Please begin.



        """

        model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
        response = model.generate_content(prompt)
        answer = response.text.strip()

        st.markdown("### 🧠 Answer")
        # Warn when any scraped result came from beyond Google's first page.
        if any(r.get("page", 1) > 1 for r in scraped_results):
            st.markdown("<span style='color:orange; font-weight:bold;'>⚠️ Includes content from deeper Google search pages</span>", unsafe_allow_html=True)
        st.markdown(answer, unsafe_allow_html=True)

        # Ranking shows ALL scored docs; unselected ones get a "-" label.
        with st.expander("πŸ“Š Document Relevance Ranking"):
            for entry in scored_docs:
                url = entry["url"]
                score = entry["score"]
                label = unique_sources.get(url, "-")
                rating = "High" if score >= 4 else "Medium" if score == 3 else "Low"
                st.markdown(f"<small>πŸ”Ή **{label}** | {rating} Relevance | Score: {score}/5 β€” <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)

        with st.expander("πŸ”— Source Citations"):
            for url, label in unique_sources.items():
                st.markdown(f"<small>πŸ”Ή **{label}**: <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)

    except Exception as e:
        # Top-level boundary: surface the failure in the UI and keep the
        # full traceback in the log.
        st.error(f"Failed to generate answer: {e}")
        logger.exception("Gemini synthesis failed.")




# import streamlit as st
# import google.generativeai as genai
# from collections import defaultdict
# import math
# from google.api_core.exceptions import ResourceExhausted

# def score_documents_with_gemini(docs, user_query):
#     model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
#     scored_docs = []

#     for i, doc in enumerate(docs):
#         url = doc.metadata.get("url", "No URL")
#         snippet = doc.page_content[:500]

#         prompt = f"""
#             You are a relevance evaluator assistant.

#             Given the user's query and a document snippet, score how relevant this document is (1 to 5 scale):

#             User Query:
#             \"\"\"{user_query}\"\"\"

#             Document Snippet:
#             \"\"\"{snippet}\"\"\"

#             Return a single number (1-5):
#         """
#         try:
#             response = model.generate_content(prompt)
#             score = int("".join(filter(str.isdigit, response.text.strip())))
#         except:
#             score = 3  # fallback

#         scored_docs.append({
#             "doc": doc,
#             "url": url,
#             "score": score
#         })

#     scored_docs.sort(key=lambda x: x["score"], reverse=True)
#     return scored_docs

# def get_diverse_documents(scored_docs, min_required):
#     by_url = defaultdict(list)
#     for entry in scored_docs:
#         by_url[entry["url"]].append(entry)

#     diverse_docs = []
#     for entries in by_url.values():
#         if len(diverse_docs) >= min_required:
#             break
#         diverse_docs.append(entries[0])  # take top scoring for each unique URL

#     return diverse_docs

# def generate_answer(user_query, retriever, scraped_results):
#     raw_docs = retriever.get_relevant_documents(user_query)
#     scored_docs = score_documents_with_gemini(raw_docs, user_query)

#     min_required = max(1, int(0.7 * len(scraped_results)))
#     selected_docs = get_diverse_documents(scored_docs, min_required=math.ceil(0.7 * len(scraped_results)))


#     context = ""
#     unique_sources = {}
#     for i, entry in enumerate(selected_docs):
#         doc = entry["doc"]
#         url = doc.metadata.get("url", "No URL Provided")
#         if url not in unique_sources:
#             unique_sources[url] = f"Source {len(unique_sources) + 1}"
#         label = unique_sources[url]
#         context += f"[{label}] ({url}):\n{doc.page_content}\n\n"



#     model = genai.GenerativeModel("gemini-2.0-pro-exp-02-05")
#     try:
#         response = model.generate_content(prompt)
#         answer = response.text.strip()
#     except ResourceExhausted as e:
#         st.error("🚫 Gemini quota limit exceeded. Please upgrade your plan or try again tomorrow.\n\nDetails: You’ve hit the free-tier limit of 25 requests/day for this model.")
#         return
#     except Exception as e:
#         st.error(f"❌ An unexpected error occurred while generating the answer:\n\n{str(e)}")
#         return

#     st.markdown("### 🧠 Answer")
#     if any(r.get("page", 1) > 1 for r in scraped_results):
#         st.markdown("<span style='color:orange; font-weight:bold;'>⚠️ Includes content from deeper Google search pages</span>", unsafe_allow_html=True)
#     st.markdown(answer, unsafe_allow_html=True)

#     # Relevance Table UI
#     with st.expander("πŸ“Š Document Relevance Ranking"):
#         for entry in scored_docs:
#             url = entry["url"]
#             score = entry["score"]
#             label = unique_sources.get(url, "-")
#             rating = "High" if score >= 4 else "Medium" if score == 3 else "Low"
#             st.markdown(f"<small>πŸ”Ή **{label}** | {rating} Relevance | Score: {score}/5 β€” <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)

#     # Final Source List
#     with st.expander("πŸ”— Source Citations"):
#         if unique_sources:
#             for url, label in unique_sources.items():
#                 st.markdown(f"<small>πŸ”Ή **{label}**: <a href='{url}' target='_blank'>{url}</a></small>", unsafe_allow_html=True)
#         else:
#             st.markdown("<small>No sources available.</small>", unsafe_allow_html=True)