File size: 13,468 Bytes
c960b7f
136b842
 
dd37de0
 
 
 
 
 
 
 
b0af36e
dd37de0
781e5ef
c960b7f
dd37de0
 
 
 
 
 
 
136b842
dd37de0
 
136b842
dd37de0
 
c960b7f
dd37de0
 
 
 
 
 
 
c960b7f
dd37de0
 
 
 
 
 
 
 
 
 
272c653
dd37de0
272c653
dd37de0
 
 
 
 
 
dc2e285
dd37de0
 
 
 
 
 
dc2e285
dd37de0
 
 
 
 
 
 
 
 
 
 
dc2e285
dd37de0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c960b7f
dd37de0
 
 
 
 
136b842
dd37de0
 
 
 
 
 
136b842
dd37de0
 
c960b7f
 
dd37de0
 
 
 
 
 
136b842
dd37de0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc2e285
dd37de0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272c653
dd37de0
 
 
136b842
dd37de0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c960b7f
dd37de0
 
c960b7f
 
dd37de0
c960b7f
dd37de0
 
 
 
 
 
 
 
272c653
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
import gradio as gr
import os
import json
import faiss
import numpy as np
import google.generativeai as genai
from newsapi import NewsApiClient
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Optional, Union

# --- Configuration ---

NEWS_API_KEY = os.getenv('NEWS_API_KEY')
GOOGLE_API_KEY = os.getenv('GEMINI_API_KEY')

if not NEWS_API_KEY:
    print("Warning: NEWS_API_KEY secret not found.")
    # Optionally raise an error or handle gracefully in the UI
if not GOOGLE_API_KEY:
    print("Warning: GOOGLE_API_KEY secret not found.")
    # Optionally raise an error or handle gracefully in the UI
else:
    try:
        # Configure Google Generative AI only if the key is present
        genai.configure(api_key=GOOGLE_API_KEY)
    except Exception as e:
        print(f"Error configuring Google Generative AI: {e}")
        # Handle configuration error

# --- Constants ---
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2' # Lightweight embedding model
LLM_MODEL_NAME = 'gemini-1.5-flash'      # Efficient Gemini model
MAX_ARTICLES_TO_FETCH = 15              # Fetch a bit more for better potential context
MAX_ARTICLES_TO_PROCESS = 7             # Process a reasonable number for context
CHUNK_SIZE = 500                        # Approximate characters per text chunk
TOP_K_CHUNKS = 4                        # Number of relevant chunks for LLM context

# --- Global Variables / Models (Load Once) ---
embedding_model = None
if GOOGLE_API_KEY: # Only load models if keys are likely set
    try:
        print("Loading embedding model...")
        embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        print("Embedding model loaded.")
    except Exception as e:
        print(f"Error loading Sentence Transformer model '{EMBEDDING_MODEL_NAME}': {e}")
        # The app might still run but RAG will fail

# --- Helper Functions (Adapted from previous script) ---

def fetch_news(topic: str) -> List[Dict[str, Any]]:
    """Fetches recent news articles for a given topic using NewsAPI."""
    if not NEWS_API_KEY:
        print("News API key missing.")
        return []
    print(f"Fetching news for topic: {topic}...")
    try:
        newsapi = NewsApiClient(api_key=NEWS_API_KEY)
        top_headlines = newsapi.get_everything(
            q=topic,
            language='en',
            sort_by='relevancy',
            page_size=MAX_ARTICLES_TO_FETCH
        )
        articles = top_headlines.get('articles', [])
        valid_articles = [
            {
                "title": article.get("title"),
                "content": article.get("content") or article.get("description", ""),
                "url": article.get("url")
            }
            for article in articles if article.get("content") or article.get("description")
        ][:MAX_ARTICLES_TO_PROCESS] # Limit here
        print(f"Fetched {len(valid_articles)} valid articles.")
        return valid_articles
    except Exception as e:
        print(f"Error fetching news: {e}")
        return []

def chunk_text(text: str, size: int) -> List[str]:
    """Splits text into chunks."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + size
        pos = text.rfind('.', start, min(end + 50, len(text)))
        if pos != -1 and pos > start + size // 2:
            end = pos + 1
        chunks.append(text[start:end].strip())
        start = end
    return [chunk for chunk in chunks if chunk]

def build_vector_store(articles: List[Dict[str, Any]], model: SentenceTransformer):
    """Creates embeddings and builds an in-memory FAISS index."""
    if model is None:
        print("Embedding model not loaded. Cannot build vector store.")
        return None, [], []
    print("Building vector store...")
    all_chunks = []
    metadata = []
    for i, article in enumerate(articles):
        if article.get('content'):
            chunks = chunk_text(article['content'], CHUNK_SIZE)
            for chunk in chunks:
                all_chunks.append(chunk)
                metadata.append({"article_index": i, "url": article.get('url'), "title": article.get('title')})

    if not all_chunks:
        print("No text content found to build vector store.")
        return None, [], []

    print(f"Generated {len(all_chunks)} chunks. Creating embeddings...")
    try:
        embeddings = model.encode(all_chunks, show_progress_bar=False) # Progress bar can be messy in logs
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(np.array(embeddings).astype('float32'))
        print("Vector store built successfully.")
        return index, all_chunks, metadata
    except Exception as e:
        print(f"Error creating embeddings or FAISS index: {e}")
        return None, [], []


def retrieve_context(query: str, index: faiss.Index, chunks: List[str], metadata: List[Dict], model: SentenceTransformer, top_k: int) -> str:
    """Retrieves the most relevant text chunks."""
    if model is None or index is None or index.ntotal == 0:
        return "No relevant context found (vector store/model unavailable)."

    print(f"Retrieving top {top_k} relevant chunks for query: '{query}'...")
    try:
        query_embedding = model.encode([query], show_progress_bar=False)
        query_embedding_np = np.array(query_embedding).astype('float32')
        distances, indices = index.search(query_embedding_np, min(top_k, index.ntotal)) # Ensure k <= index.ntotal

        context_parts = []
        seen_urls = set()
        retrieved_sources = [] # Track sources used in context

        for i, idx in enumerate(indices[0]):
            if 0 <= idx < len(chunks):
                chunk_text = chunks[idx]
                meta = metadata[idx]
                source_info = f"(Source: {meta.get('url', 'N/A')})"
                full_info = ""
                if meta.get('url') and meta['url'] not in seen_urls:
                    full_info = f"From '{meta.get('title', 'Untitled')}':\n{chunk_text}\n{source_info}"
                    seen_urls.add(meta['url'])
                    if meta.get('url'): retrieved_sources.append(meta['url'])
                else:
                    full_info = f"{chunk_text}\n{source_info}"
                    # Add source URL if available and not already added from this chunk group
                    if meta.get('url') and meta['url'] not in seen_urls:
                       seen_urls.add(meta['url'])
                       if meta.get('url'): retrieved_sources.append(meta['url'])

                context_parts.append(full_info)

        if not context_parts:
            return "No relevant context found matching the query."

        print(f"Retrieved {len(context_parts)} context parts.")
        # Return context and the list of sources used in that context
        return "\n\n".join(context_parts), list(set(retrieved_sources)) # Use set for uniqueness
    except Exception as e:
        print(f"Error during context retrieval: {e}")
        return "Error retrieving context.", []


def generate_structured_summary(context: str, topic: str) -> Optional[Dict[str, Any]]:
    """Generates a summary using Gemini with structured output."""
    if not GOOGLE_API_KEY:
        print("Google API Key missing. Cannot generate summary.")
        return None
    print("Generating structured summary with LLM...")
    try:
        model = genai.GenerativeModel(LLM_MODEL_NAME)
        json_schema = {
            "type": "object",
            "properties": {
                "topic": {"type": "string"},
                "summary_points": {"type": "array", "items": {"type": "string"}},
                "mentioned_sources": {"type": "array", "items": {"type": "string", "format": "uri"}}
            },
            "required": ["topic", "summary_points", "mentioned_sources"]
        }
        prompt = f"""
        Analyze the following retrieved context about '{topic}'. Create a concise summary highlighting the key information.
        Extract the main points and list the unique source URLs mentioned ONLY in the provided context below.
        Respond ONLY with a valid JSON object matching this schema:

        Schema:
        {json.dumps(json_schema, indent=2)}

        Retrieved Context:
        ---
        {context}
        ---

        JSON Output:
        """

        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                 response_mime_type="application/json"
            )
        )
        summary_json = json.loads(response.text)
        print("LLM generation successful.")
        return summary_json
    except Exception as e:
        print(f"Error during LLM generation or JSON parsing: {e}")
        try:
            # Try to log the raw response if possible for debugging
            print(f"LLM Raw Response Text (if available): {response.text}")
        except:
            pass
        return None

# --- Main Gradio Function ---
def summarize_news_interface(topic: str) -> Union[Dict, str]:
    """Orchestrates the news summarization process for the Gradio interface."""
    print(f"\n--- Processing request for topic: {topic} ---")
    if not topic:
        return {"error": "Please enter a topic."}
    if not NEWS_API_KEY or not GOOGLE_API_KEY:
         return {"error": "API Key secrets are not configured correctly in this Space."}
    if embedding_model is None:
        return {"error": "Embedding model could not be loaded. RAG is disabled."}

    # 1. Fetch News
    articles = fetch_news(topic)
    if not articles:
        return {"error": f"Could not fetch any news articles for '{topic}'. Please try a different topic or check NewsAPI key."}

    # 2. Build Vector Store (RAG - Embeddings & Indexing)
    vector_index, text_chunks, chunk_metadata = build_vector_store(articles, embedding_model)
    if vector_index is None:
        # Fallback or error - here we'll indicate RAG failed but might proceed without it later if desired
         return {"error": "Could not build vector store (likely no usable article content). RAG step failed."}

    # 3. Retrieve Relevant Context (RAG - Retrieval)
    context_result = retrieve_context(topic, vector_index, text_chunks, chunk_metadata, embedding_model, TOP_K_CHUNKS)

    # Check if retrieve_context returned a tuple (context, sources) or an error string
    if isinstance(context_result, tuple):
        retrieved_context, sources_in_context = context_result
        print(f"Context retrieved successfully. Sources in context: {len(sources_in_context)}")
    else: # Handle error string case
        retrieved_context = context_result # Contains the error message
        sources_in_context = []
        print(f"Context retrieval issue: {retrieved_context}")
        # Decide how to proceed. For now, we'll try generating without specific context.
        # A better approach might be to summarize top articles directly, or just show the error.
        # For simplicity, we will show an error JSON
        return {"error": "Failed to retrieve relevant context via RAG.", "details": retrieved_context}


    # 4. Generate Structured Summary (Document Understanding + Structured Output)
    # Pass only the sources found in the *retrieved context* to the LLM if needed,
    # but the current prompt asks it to extract from the context itself.
    summary_output = generate_structured_summary(retrieved_context, topic)

    if summary_output:
        # Ensure the sources list in the JSON only contains those from the context
        # The LLM should ideally handle this based on the prompt, but we can double-check/override.
        # summary_output['mentioned_sources'] = sources_in_context # Optional override
        print("--- Request processing complete ---")
        return summary_output
    else:
        print("--- Request processing failed at LLM step ---")
        # Provide specific error if LLM failed
        return {"error": "Failed to generate summary using the LLM.", "details": "Check logs for potential API errors or LLM issues."}

# --- Gradio Interface Definition ---
demo = gr.Interface(
    fn=summarize_news_interface,
    inputs=gr.Textbox(
        label="Enter News Topic",
        placeholder="e.g., latest advancements in renewable energy, Premier League results, space exploration updates..."
    ),
    outputs=gr.JSON(label="News Digest Summary"),
    title="📰 AI News Digest Generator",
    description=(
        "Enter a topic to get a structured summary of recent news articles.\n"
        "This app uses RAG (Retrieval Augmented Generation) with FAISS/SentenceTransformers "
        "and Google Gemini for summarization.\n"
    ),
    examples=[
        ["AI in healthcare"],
        ["Electric vehicle market trends"],
        ["Recent archaeological discoveries"]
    ],
    allow_flagging='never',
    # theme=gr.themes.Soft() # Optional: adds a theme
)

# --- Launch the App ---
if __name__ == "__main__":
    # Check for keys on launch locally (won't hurt on Spaces)
    if not NEWS_API_KEY or not GOOGLE_API_KEY:
        print("\n*** WARNING: API Keys not found as environment variables. ***")
        print("*** Please set NEWS_API_KEY and GOOGLE_API_KEY if running locally. ***")
        print("*** In Hugging Face Spaces, set them as Secrets in Settings. ***\n")
    elif embedding_model is None:
         print("\n*** WARNING: Embedding model failed to load. RAG features will not work. ***\n")

    demo.launch()