Spaces:

Kakaarot
/

AI-Powered-News-Digest-Dynamo

Sleeping

App Files Files Community

Kakaarot committed on Apr 20

Commit

dc2e285

verified ·

1 Parent(s): 272c653

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -54

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from retry import retry
 import os
 import json
-# Configure Gemini API with key from Hugging Face Secrets
 api_key = os.getenv("GEMINI_API_KEY")
 if not api_key:
     raise ValueError("GEMINI_API_KEY environment variable not set")
@@ -25,50 +25,44 @@ articles = [
     "Coral reefs face bleaching from rising ocean temperatures."
 ]
-# Generate embeddings
-embedding_model = "models/embedding-001"  # Update to correct model name
 df = pd.DataFrame({"article": articles})
 @retry(tries=3, delay=2, backoff=2)
 def get_embedding(text):
     try:
         result = genai.embed_content(model=embedding_model, content=text, task_type="RETRIEVAL_DOCUMENT")
-        # Extract embedding correctly based on API response structure
         embedding = result.embedding
         return embedding
     except Exception as e:
         print(f"Embedding error: {e}")
         raise
-# Generate all embeddings first
-all_embeddings = []
-for article in articles:
-    try:
-        embedding = get_embedding(article)
-        all_embeddings.append(embedding)
-    except Exception as e:
-        print(f"Failed to embed article: {article[:30]}... Error: {e}")
-        all_embeddings.append([0] * 768)  # Default embedding dimension, adjust if needed
-df["embedding"] = all_embeddings
-# Initialize ChromaDB
 client_db = chromadb.Client()
 collection = client_db.get_or_create_collection("news_articles")
 # Clear existing data to avoid duplicates
 try:
     collection.delete(ids=[str(i) for i in range(len(df))])
-except:
     pass  # Collection might be empty
-# Add documents to collection
 for idx, row in df.iterrows():
-    collection.add(
-        documents=[row["article"]],
-        embeddings=[row["embedding"]],
-        ids=[str(idx)]
-    )
 # Semantic Search
 @retry(tries=3, delay=2, backoff=2)
@@ -76,6 +70,8 @@ def search_articles(query, top_k=3):
     try:
         query_embedding = get_embedding(query)
         results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
         indices = [int(id) for id in results["ids"][0]]
         return df.iloc[indices]["article"].tolist()
     except Exception as e:
@@ -83,7 +79,7 @@ def search_articles(query, top_k=3):
         return []
 # RAG and Structured Q&A
-generation_model = genai.GenerativeModel("gemini-1.5-pro")  # Verify model name
 @retry(tries=3, delay=2, backoff=2)
 def generate_response(query, articles, system_message):
@@ -91,13 +87,6 @@ def generate_response(query, articles, system_message):
         return "No relevant articles found.", json.dumps({"error": "No relevant articles found."})
     context = "\n".join(articles)
-    safety_settings = [
-        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
-        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
-        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
-        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
-    ]
     prompt = f"""
     {system_message}
     Based on the following articles, provide a concise summary (under 100 words) and a structured JSON response with 'question', 'answer', and 'source'. Use only the provided context.
@@ -111,41 +100,45 @@ def generate_response(query, articles, system_message):
     - Summary:
     - JSON:
     """
     try:
-        generation_config = {
-            "temperature": 0.7,
-            "top_p": 0.95,
-            "top_k": 40,
-            "max_output_tokens": 1024,
-        }
         response = generation_model.generate_content(
             prompt,
-            generation_config=generation_config,
-            safety_settings=safety_settings,
             stream=False
         )
         full_text = response.text
-        # Parse response
-        summary_end = full_text.find("- JSON:")
-        summary = full_text[full_text.find("- Summary:") + len("- Summary:"):summary_end].strip() if "- Summary:" in full_text else "Summary not generated."
-        qa_json = full_text[summary_end + len("- JSON:"):].strip()
-        # Clean up the JSON string to make it parseable
-        qa_json = qa_json.replace("``````", "").strip()
-        try:
-            qa = json.loads(qa_json)
-        except json.JSONDecodeError:
-            print(f"JSON parse error. Raw string: {qa_json}")
-            qa = {"error": "Failed to parse JSON response.", "raw_text": qa_json}
-        return summary, json.dumps(qa, indent=2)
     except Exception as e:
         print(f"RAG error: {e}")
-        return "Error generating response.", json.dumps({"error": f"Failed to generate response: {str(e)}"})
 def respond(message, history, system_message="You are a news summarizer and Q&A assistant.", max_tokens=512, temperature=0.7, top_p=0.95):
     articles = search_articles(message)

 import os
 import json
+# API Key Validation
 api_key = os.getenv("GEMINI_API_KEY")
 if not api_key:
     raise ValueError("GEMINI_API_KEY environment variable not set")
     "Coral reefs face bleaching from rising ocean temperatures."
 ]
+# Generate embeddings - with corrected model name
+embedding_model = "models/embedding-001"  # Correct model name
 df = pd.DataFrame({"article": articles})
 @retry(tries=3, delay=2, backoff=2)
 def get_embedding(text):
     try:
         result = genai.embed_content(model=embedding_model, content=text, task_type="RETRIEVAL_DOCUMENT")
+        # Correct way to access embedding
         embedding = result.embedding
         return embedding
     except Exception as e:
         print(f"Embedding error: {e}")
         raise
+# Generate embeddings and ensure they're in the correct format
+df["embedding"] = df["article"].apply(get_embedding)
+# Initialize ChromaDB with proper error handling
 client_db = chromadb.Client()
 collection = client_db.get_or_create_collection("news_articles")
 # Clear existing data to avoid duplicates
 try:
     collection.delete(ids=[str(i) for i in range(len(df))])
+except Exception:
     pass  # Collection might be empty
+# Add documents with error handling
 for idx, row in df.iterrows():
+    try:
+        collection.add(
+            documents=[row["article"]],
+            embeddings=[row["embedding"]],
+            ids=[str(idx)]
+        )
+    except Exception as e:
+        print(f"Error adding document {idx}: {e}")
 # Semantic Search
 @retry(tries=3, delay=2, backoff=2)
     try:
         query_embedding = get_embedding(query)
         results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
+        if not results["ids"][0]:
+            return []
         indices = [int(id) for id in results["ids"][0]]
         return df.iloc[indices]["article"].tolist()
     except Exception as e:
         return []
 # RAG and Structured Q&A
+generation_model = genai.GenerativeModel("gemini-1.5-pro")  # Corrected model name
 @retry(tries=3, delay=2, backoff=2)
 def generate_response(query, articles, system_message):
         return "No relevant articles found.", json.dumps({"error": "No relevant articles found."})
     context = "\n".join(articles)
     prompt = f"""
     {system_message}
     Based on the following articles, provide a concise summary (under 100 words) and a structured JSON response with 'question', 'answer', and 'source'. Use only the provided context.
     - Summary:
     - JSON:
     """
     try:
         response = generation_model.generate_content(
             prompt,
+            generation_config={
+                "temperature": 0.7,
+                "top_p": 0.95,
+                "max_output_tokens": 1024,
+            },
             stream=False
         )
         full_text = response.text
+        # Robust parsing
+        summary = "Summary not generated."
+        if "- Summary:" in full_text:
+            summary_start = full_text.find("- Summary:") + len("- Summary:")
+            summary_end = full_text.find("- JSON:", summary_start)
+            if summary_end > summary_start:
+                summary = full_text[summary_start:summary_end].strip()
+        qa_json = "{}"
+        if "- JSON:" in full_text:
+            json_start = full_text.find("- JSON:") + len("- JSON:")
+            qa_json_text = full_text[json_start:].strip()
+            # Clean up the JSON string - remove markdown code blocks
+            qa_json_text = qa_json_text.replace("``````", "").strip()
+            try:
+                qa = json.loads(qa_json_text)
+                qa_json = json.dumps(qa, indent=2)
+            except json.JSONDecodeError:
+                qa_json = json.dumps({"error": "Failed to parse JSON response.", "raw_text": qa_json_text})
+        return summary, qa_json
     except Exception as e:
         print(f"RAG error: {e}")
+        return f"Error generating response: {str(e)}", json.dumps({"error": f"Failed to generate response: {str(e)}"})
 def respond(message, history, system_message="You are a news summarizer and Q&A assistant.", max_tokens=512, temperature=0.7, top_p=0.95):
     articles = search_articles(message)