Spaces:

Zahid0123
/

hackathon-rag

Sleeping

App Files Files Community

Zahid0123 committed on Nov 21, 2025

Commit

a72115f

verified ·

1 Parent(s): 42a5184

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -166

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py - FULL AI Research Agent with Agentic RAG + Voice + All Tools (HF Spaces Ready - Nov 2025)
 import os
 import re
 import ast
@@ -16,113 +16,64 @@ from tqdm import tqdm
 import PyPDF2
 from sentence_transformers import SentenceTransformer
 import faiss
-from groq import Groq
 import gradio as gr
 from gtts import gTTS
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # ===================================================================
-# WEB SEARCH TOOL (DuckDuckGo - no API key needed)
 # ===================================================================
 class WebSearchTool:
     def __init__(self, max_results: int = 5):
         self.max_results = max_results
-        self.base_url = "https://api.duckduckgo.com/"
     def search(self, query: str) -> Dict[str, Any]:
         try:
             params = {
-                'q': query,
-                'format': 'json',
-                'no_redirect': '1',
-                'no_html': '1',
-                'skip_disambig': '1'
-            }
-            response = requests.get(self.base_url, params=params, timeout=10)
-            response.raise_for_status()
-            data = response.json()
-            results = {
-                'abstract': data.get('Abstract', '') or data.get('Answer', ''),
-                'related': [
-                    {'text': t.get('Text', ''), 'url': t.get('FirstURL', '')}
-                    for t in data.get('RelatedTopics', [])[:self.max_results]
-                    if 'Text' in t
-                ]
             }
-            return results
         except Exception as e:
-            logger.error(f"Web search failed: {e}")
             return {'abstract': '', 'related': []}
 # ===================================================================
-# DOCUMENT PROCESSING
-# ===================================================================
class DocumentProcessor:
    """Load PDF files from a directory tree and extract their text."""

    @staticmethod
    def _find_pdfs(root: Path) -> List[Path]:
        """Return every PDF file under *root*, sorted for a stable order.

        The suffix is compared case-insensitively so that ``REPORT.PDF`` is
        found as well; a plain ``rglob("*.pdf")`` would skip it on
        case-sensitive filesystems. Directories that merely end in ``.pdf``
        are ignored.
        """
        return sorted(
            candidate
            for candidate in root.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() == ".pdf"
        )

    def load_documents(self, data_directory: str) -> List[Dict[str, Any]]:
        """Extract the text of every PDF found below *data_directory*.

        Args:
            data_directory: Directory that is searched recursively.

        Returns:
            One dict per PDF that yielded text, with the keys ``doc_id``
            (path relative to *data_directory*), ``content`` (stripped text,
            pages separated by newlines) and ``file_path``. PDFs that cannot
            be read, or that contain no extractable text, are left out.
        """
        documents = []
        path = Path(data_directory)
        for file_path in self._find_pdfs(path):
            try:
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    # extract_text() may return an empty value for image-only
                    # pages; those pages contribute nothing.
                    page_texts = [page.extract_text() for page in reader.pages]
                text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
                if text.strip():
                    documents.append({
                        'doc_id': str(file_path.relative_to(path)),
                        'content': text.strip(),
                        'file_path': str(file_path)
                    })
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the whole load.
                logger.error(f"Error reading {file_path}: {e}")
        return documents
class DocumentChunker:
    """Split document text into overlapping, sentence-aligned chunks."""

    def __init__(self, chunk_size=512, chunk_overlap=50):
        # Maximum number of characters per chunk.
        self.chunk_size = chunk_size
        # Number of characters repeated at the start of the following chunk.
        self.chunk_overlap = chunk_overlap

    def chunk_documents(self, documents: List[Dict]) -> List[Dict]:
        """Cut every document into chunks of at most ``chunk_size`` characters.

        Whitespace runs are collapsed to single spaces first. A chunk that is
        not the last one of its document is shortened to end just after the
        last ``.``, ``!`` or ``?`` in its window, provided that mark lies in
        the second half of the window.

        Args:
            documents: Dicts with the keys ``doc_id``, ``content`` and
                ``file_path``.

        Returns:
            Chunk dicts with ``chunk_id`` (``"<doc_id>_<start offset>"``),
            ``content``, ``doc_id`` and ``source_file``. Chunks of 50
            characters or fewer are dropped.
        """
        chunks = []
        for doc in documents:
            text = re.sub(r'\s+', ' ', doc['content']).strip()
            length = len(text)
            start = 0
            while start < length:
                end = min(start + self.chunk_size, length)
                if end < length:
                    window = text[start:end]
                    last_period = max(window.rfind(mark) for mark in '.!?')
                    if last_period > self.chunk_size // 2:
                        end = start + last_period + 1
                chunks.append({
                    'chunk_id': f"{doc['doc_id']}_{start}",
                    'content': text[start:end].strip(),
                    'doc_id': doc['doc_id'],
                    'source_file': doc['file_path']
                })
                if end >= length:
                    # The tail has been emitted. Stepping back by the overlap
                    # here would re-emit it forever.
                    break
                # Always move forward, even if the overlap is as large as the
                # chunk that was just produced.
                start = max(end - self.chunk_overlap, start + 1)
        return [c for c in chunks if len(c['content']) > 50]
-# ===================================================================
-# EMBEDDING & RETRIEVER
 # ===================================================================
 class DocumentRetriever:
-    def __init__(self, model_name='all-MiniLM-L6-v2'):
-        self.embedder = SentenceTransformer(model_name)
         self.chunks = []
         self.index = None
     def build_index(self, chunks: List[Dict]):
         self.chunks = chunks
         texts = [c['content'] for c in chunks]
         embeddings = self.embedder.encode(texts, batch_size=32, show_progress_bar=False, convert_to_numpy=True)
@@ -132,57 +83,63 @@ class DocumentRetriever:
         self.index.add(embeddings.astype('float32'))
     def search(self, query: str, k: int = 8) -> List[Dict]:
-        if not self.index:
             return []
         q_emb = self.embedder.encode([query], convert_to_numpy=True)
         q_emb = q_emb / np.linalg.norm(q_emb)
         scores, indices = self.index.search(q_emb.astype('float32'), k)
         results = []
         for score, idx in zip(scores[0], indices[0]):
-            if idx < len(self.chunks):
                 chunk = self.chunks[idx].copy()
                 chunk['score'] = float(score)
                 results.append(chunk)
         return results
 # ===================================================================
-# AGENTIC TOOLS
 # ===================================================================
 class AgenticTools:
     def __init__(self):
         self.web_search = WebSearchTool()
-    def calculator(self, expression: str) -> Dict:
         try:
-            safe_expr = re.sub(r'[^0-9+\-*/(). ]', '', expression)
-            tree = ast.parse(safe_expr, mode='eval')
-            result = eval(compile(tree, '<string>', 'eval'), {"__builtins__": {}})
-            return {"success": True, "result": result}
         except:
-            return {"success": False, "error": "Invalid calculation"}
-    def web_search(self, query: str) -> Dict:
         result = self.web_search.search(query)
         return {"success": True, "result": result}
 # ===================================================================
-# MAIN AGENT
 # ===================================================================
 class AgenticRAGAgent:
     def __init__(self):
-        self.retriever = None
         self.tools = AgenticTools()
         api_key = os.getenv("GROQ_API_KEY")
-        self.groq = Groq(api_key=api_key) if api_key else None
         self.temperature = 0.3
         self.max_tokens = 600
-        self.chunk_size = 512
-        self.chunk_overlap = 50
         self.retrieval_k = 8
     def clean_for_tts(self, text: str) -> str:
-        text = re.sub(r'\*\*|\*|_|-|`|\[.*?\]|\(.*?\)|#{1,6}|>', '', text)
         text = re.sub(r'\s+', ' ', text).strip()
         return text
@@ -196,7 +153,7 @@ class AgenticRAGAgent:
             tts.save(tmp.name)
             return tmp.name
         except Exception as e:
-            logger.error(f"TTS failed: {e}")
             return None
     def upload_pdfs(self, files):
@@ -204,28 +161,35 @@ class AgenticRAGAgent:
             return "No files uploaded."
         os.makedirs("sample_data", exist_ok=True)
-        processor = DocumentProcessor()
-        chunker = DocumentChunker(self.chunk_size, self.chunk_overlap)
-        docs = processor.load_documents("sample_data")
-        # Save new files
         for file in files:
-            if str(file.name).lower().endswith('.pdf'):
-                dest = f"sample_data/{Path(file.name).name}"
-                with open(dest, "wb") as f:
-                    f.write(file.read() if hasattr(file, 'read') else file)
-        # Reprocess all
-        docs = processor.load_documents("sample_data")
-        chunks = chunker.chunk_documents(docs)
-        if not chunks:
             return "No text extracted from PDFs."
-        self.retriever = DocumentRetriever()
-        self.retriever.build_index(chunks)
-        return f"Success! Loaded {len(docs)} PDFs → {len(chunks)} chunks ready."
     def process_query(self, query: str, history: List):
         if not query.strip():
@@ -234,44 +198,43 @@ class AgenticRAGAgent:
         if not history:
             history = []
-        # Greeting
-        if query.strip().lower() in ["hi", "hello", "hey", "howdy", "good morning"]:
-            resp = "Hello! I'm your AI Research Agent with agentic tools and voice answers. Upload PDFs and ask anything!"
             history.append([query, resp])
             return history, self.text_to_speech(resp)
-        if not self.retriever:
             resp = "Please upload at least one PDF document first!"
             history.append([query, resp])
             return history, None
-        # Retrieve context
         docs = self.retriever.search(query, k=self.retrieval_k)
-        context = "\n\n".join([d['content'] for d in docs[:6]])
-        # Tools execution
-        tool_results = {}
-        if any(op in query.lower() for op in ['calculate', 'math', '+', '-', '*', '/']):
-            tool_results['calculator'] = self.tools.calculator(query)
-        if any(kw in query.lower() for kw in ['current', 'latest', 'price', 'news', 'today']):
-            tool_results['web_search'] = self.tools.web_search(query)
-        # Final synthesis
         prompt = f"""You are an expert research assistant.
-Context from documents:
 {context}
-Additional tool results:
-{tool_results}
 Question: {query}
-Provide a clear, comprehensive answer with confidence level at the end."""
         try:
             if not self.groq:
-                answer = "GROQ_API_KEY not set. Add it in Secrets."
             else:
                 resp = self.groq.chat.completions.create(
                     model="llama-3.1-70b-versatile",
@@ -281,63 +244,45 @@ Provide a clear, comprehensive answer with confidence level at the end."""
                 )
                 answer = resp.choices[0].message.content.strip()
         except Exception as e:
-            answer = f"Error: {str(e)}"
         history.append([query, answer])
         audio = self.text_to_speech(answer)
         return history, audio
-    def update_settings(self, temp, tokens, chunk, overlap, k):
-        self.temperature = temp
-        self.max_tokens = tokens
-        self.chunk_size = chunk
-        self.chunk_overlap = overlap
-        self.retrieval_k = k
-        return f"Settings updated: Temp={temp}, Tokens={tokens}, Chunks={k}"
 # ===================================================================
 # GRADIO INTERFACE
 # ===================================================================
-def create_interface():
     agent = AgenticRAGAgent()
-    with gr.Blocks(theme=gr.themes.Soft(), title="AI Research Agent - Agentic RAG + Voice") as demo:
-        gr.Markdown("# 🤖 AI Research Agent\n**Agentic RAG • Multi-Tool • Voice Answers**")
         with gr.Row():
             with gr.Column(scale=3):
-                chatbot = gr.Chatbot(height=550)
-                msg = gr.Textbox(placeholder="Ask a complex research question...", label="Question")
                 with gr.Row():
-                    send = gr.Button("Send", variant="primary")
                     clear = gr.Button("Clear")
-                audio = gr.Audio(label="Voice Response", autoplay=True)
             with gr.Column(scale=1):
-                gr.Markdown("### Upload Documents")
                 files = gr.Files(file_types=[".pdf"], file_count="multiple")
-                status = gr.Textbox(label="Status", interactive=False, lines=5)
-                with gr.Accordion("Settings", open=False):
-                    temp = gr.Slider(0.0, 1.0, value=0.3, label="Temperature")
-                    tokens = gr.Slider(100, 1000, value=600, step=50, label="Max Tokens")
-                    chunk = gr.Slider(256, 1024, value=512, step=64, label="Chunk Size")
-                    overlap = gr.Slider(0, 200, value=50, label="Chunk Overlap")
-                    k = gr.Slider(3, 20, value=8, label="Retrieved Chunks")
-                    apply = gr.Button("Apply Settings")
-                    settings_status = gr.Textbox(label="Settings", interactive=False)
         def respond(q, h):
             h, a = agent.process_query(q, h)
             return "", h, a
-        msg.submit(respond, [msg, chatbot], [msg, chatbot, audio])
-        send.click(respond, [msg, chatbot], [msg, chatbot, audio])
-        clear.click(lambda: ([], None), outputs=[chatbot, audio])
         files.change(agent.upload_pdfs, files, status)
-        apply.click(agent.update_settings, [temp, tokens, chunk, overlap, k], settings_status)
-        gr.Markdown("**Required**: Add `GROQ_API_KEY` in Space Secrets → [console.groq.com](https://console.groq.com)")
     return demo
@@ -345,5 +290,5 @@ def create_interface():
 # LAUNCH
 # ===================================================================
 if __name__ == "__main__":
-    app = create_interface()
     app.launch(server_name="0.0.0.0", server_port=7860)

+# app.py - FULL AI Research Agent with Agentic RAG, Multi-Tool, Voice & Settings (HF Spaces 100% Working)
 import os
 import re
 import ast
 import PyPDF2
 from sentence_transformers import SentenceTransformer
 import faiss
 import gradio as gr
 from gtts import gTTS
+# =================== FIX FOR GROQ PROXIES ERROR ===================
+# Safe Groq client initialization - works with ALL versions (0.8.0 to latest)
+try:
+    from groq import Groq
+    GROQ_AVAILABLE = True
+except ImportError:
+    GROQ_AVAILABLE = False
+    Groq = None
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # ===================================================================
# WEB SEARCH TOOL (DuckDuckGo - no key needed)
# ===================================================================
class WebSearchTool:
    """Best-effort client for the DuckDuckGo Instant Answer endpoint.

    ``search`` never raises: any failure is logged and reported as an empty
    result, so a broken network connection cannot break the calling agent.
    """

    def __init__(self, max_results: int = 5):
        # Upper bound on the number of related topics returned by search().
        self.max_results = max_results

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Return the result shape used when nothing could be found."""
        return {'abstract': '', 'related': []}

    @classmethod
    def _iter_topics(cls, entries):
        """Yield every topic dict that carries a ``Text`` field.

        Entries without ``Text`` but with a nested ``Topics`` list (category
        groups in the API response) are flattened instead of being dropped.
        """
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            if 'Text' in entry:
                yield entry
            elif isinstance(entry.get('Topics'), list):
                yield from cls._iter_topics(entry['Topics'])

    def _parse_response(self, data: Any) -> Dict[str, Any]:
        """Convert a decoded API response into ``{'abstract', 'related'}``.

        ``abstract`` is always a string (``Abstract``, else ``Answer``, else
        empty), so callers can concatenate it safely. ``related`` holds at
        most ``max_results`` usable topics; the limit is applied after
        unusable entries have been filtered out.
        """
        if not isinstance(data, dict):
            return self._empty_result()
        abstract = str(data.get('Abstract') or data.get('Answer') or '')
        topics = data.get('RelatedTopics')
        if not isinstance(topics, list):
            topics = []
        related = []
        for topic in self._iter_topics(topics):
            if len(related) >= self.max_results:
                break
            related.append({
                'text': topic.get('Text', ''),
                'url': topic.get('FirstURL', '')
            })
        return {'abstract': abstract, 'related': related}

    def search(self, query: str) -> Dict[str, Any]:
        """Query DuckDuckGo and return the abstract plus related topics.

        Args:
            query: Free-text search query. A blank query returns an empty
                result without touching the network.

        Returns:
            ``{'abstract': str, 'related': [{'text': str, 'url': str}, ...]}``.
            On any error the same shape is returned with empty values.
        """
        try:
            if not query or not query.strip():
                return self._empty_result()
            url = "https://api.duckduckgo.com/"
            params = {
                'q': query, 'format': 'json', 'no_html': '1',
                'no_redirect': '1', 'skip_disambig': '1'
            }
            r = requests.get(url, params=params, timeout=10)
            r.raise_for_status()
            return self._parse_response(r.json())
        except Exception as e:
            # Deliberately broad: this is a tool boundary and the agent must
            # keep working when the search service is unavailable.
            logger.error(f"Web search error: {e}")
            return self._empty_result()
 # ===================================================================
+# DOCUMENT PROCESSING & RETRIEVAL
 # ===================================================================
 class DocumentRetriever:
+    def __init__(self):
         self.chunks = []
         self.index = None
+        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
     def build_index(self, chunks: List[Dict]):
+        if not chunks:
+            return
         self.chunks = chunks
         texts = [c['content'] for c in chunks]
         embeddings = self.embedder.encode(texts, batch_size=32, show_progress_bar=False, convert_to_numpy=True)
         self.index.add(embeddings.astype('float32'))
     def search(self, query: str, k: int = 8) -> List[Dict]:
+        if not self.index or not self.chunks:
             return []
         q_emb = self.embedder.encode([query], convert_to_numpy=True)
         q_emb = q_emb / np.linalg.norm(q_emb)
         scores, indices = self.index.search(q_emb.astype('float32'), k)
         results = []
         for score, idx in zip(scores[0], indices[0]):
+            if 0 <= idx < len(self.chunks):
                 chunk = self.chunks[idx].copy()
                 chunk['score'] = float(score)
                 results.append(chunk)
         return results
 # ===================================================================
# AGENT TOOLS
# ===================================================================
class AgenticTools:
    """Tools the agent can call: an arithmetic calculator and a web search."""

    # Operators the calculator accepts. Exponentiation is left out on purpose:
    # the input comes from users and a tower of powers would be unbounded work.
    _BINARY_OPS = {
        ast.Add: lambda left, right: left + right,
        ast.Sub: lambda left, right: left - right,
        ast.Mult: lambda left, right: left * right,
        ast.Div: lambda left, right: left / right,
        ast.FloorDiv: lambda left, right: left // right,
    }
    _UNARY_OPS = {
        ast.UAdd: lambda operand: +operand,
        ast.USub: lambda operand: -operand,
    }

    def __init__(self):
        # The attribute holds the search client; the public method is called
        # web_search_tool so that this attribute does not hide it.
        self.web_search = WebSearchTool()

    @classmethod
    def _evaluate(cls, node):
        """Evaluate an arithmetic AST node; raise ValueError for anything else."""
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in cls._BINARY_OPS:
            operation = cls._BINARY_OPS[type(node.op)]
            return operation(cls._evaluate(node.left), cls._evaluate(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in cls._UNARY_OPS:
            return cls._UNARY_OPS[type(node.op)](cls._evaluate(node.operand))
        raise ValueError("unsupported expression")

    def calculator(self, expr: str) -> Dict:
        """Evaluate the arithmetic contained in *expr*.

        Every character other than digits, ``+ - * / ( ) .`` and spaces is
        removed first, so a sentence such as ``"what is 2 + 3"`` works. The
        remaining text is parsed and evaluated by walking the syntax tree;
        ``eval`` is not used.

        Args:
            expr: Free text containing an arithmetic expression.

        Returns:
            ``{"success": True, "result": "<value as string>"}`` on success,
            otherwise ``{"success": False, "error": "Invalid math"}``.
        """
        try:
            # Strip the blanks left behind by removed words; the parser would
            # otherwise reject the leading whitespace as an unexpected indent.
            safe = re.sub(r'[^0-9+\-*/(). ]', '', expr).strip()
            tree = ast.parse(safe, mode='eval')
            result = str(self._evaluate(tree.body))
        except (SyntaxError, ValueError, TypeError, ArithmeticError,
                RecursionError, MemoryError):
            return {"success": False, "error": "Invalid math"}
        return {"success": True, "result": result}

    def web_search_tool(self, query: str) -> Dict:
        """Run a web search and wrap its result in the tool response shape."""
        result = self.web_search.search(query)
        return {"success": True, "result": result}
 # ===================================================================
+# MAIN AGENT CLASS
 # ===================================================================
 class AgenticRAGAgent:
     def __init__(self):
         """Create the retriever, the tool set and, when possible, the Groq client.

         ``self.groq`` stays ``None`` when the groq package could not be
         imported, when ``GROQ_API_KEY`` is not set, or when constructing the
         client raises.
         """
         self.retriever = DocumentRetriever()
         self.tools = AgenticTools()

         # Generation and retrieval settings.
         self.temperature = 0.3
         self.max_tokens = 600
         self.retrieval_k = 8

         # The LLM client is optional; a failure here is logged, not raised.
         self.groq = None
         key = os.getenv("GROQ_API_KEY")
         if not (GROQ_AVAILABLE and key):
             return
         try:
             self.groq = Groq(api_key=key)
             logger.info("Groq client initialized successfully")
         except Exception as err:
             logger.error(f"Groq init failed: {err}")
     def clean_for_tts(self, text: str) -> str:
         """Return *text* without Markdown markers and with whitespace collapsed.

         Removes ``*``, ``_``, backticks, ``#`` and square brackets, then turns
         every whitespace run into a single space and trims both ends.
         """
         without_markup = re.sub(r'[\*_`#\[\]]', '', text)
         return re.sub(r'\s+', ' ', without_markup).strip()
             tts.save(tmp.name)
             return tmp.name
         except Exception as e:
+            logger.error(f"TTS error: {e}")
             return None
     def upload_pdfs(self, files):
             return "No files uploaded."
         os.makedirs("sample_data", exist_ok=True)
+        all_chunks = []
         for file in files:
+            if not str(file.name).lower().endswith('.pdf'):
+                continue
+            dest = Path("sample_data") / Path(file.name).name
+            with open(dest, "wb") as f:
+                content = file.read() if hasattr(file, 'read') else file
+                f.write(content)
+            try:
+                text = ""
+                with open(dest, 'rb') as f:
+                    reader = PyPDF2.PdfReader(f)
+                    for page in reader.pages:
+                        page_text = page.extract_text()
+                        if page_text:
+                            text += page_text + " "
+                if text.strip():
+                    chunks = [text[i:i+500] for i in range(0, len(text), 450)]
+                    all_chunks.extend([{"content": c, "source": dest.name} for c in chunks])
+            except Exception as e:
+                continue
+        if not all_chunks:
             return "No text extracted from PDFs."
+        self.retriever.build_index(all_chunks)
+        return f"Success! Loaded {len(all_chunks)} chunks from uploaded PDFs."
     def process_query(self, query: str, history: List):
         if not query.strip():
         if not history:
             history = []
+        query_lower = query.lower().strip()
+        if query_lower in ["hi", "hello", "hey", "howdy"]:
+            resp = "Hello! I'm your AI Research Agent with voice answers, web search, calculator, and PDF RAG. Upload documents and ask anything!"
             history.append([query, resp])
             return history, self.text_to_speech(resp)
+        if not self.retriever.index:
             resp = "Please upload at least one PDF document first!"
             history.append([query, resp])
             return history, None
+        # Retrieve
         docs = self.retriever.search(query, k=self.retrieval_k)
+        context = "\n\n".join([d['content'][:1000] for d in docs[:6]])
+        # Tool use
+        tool_output = ""
+        if any(op in query_lower for op in ['+', '-', '*', '/', 'calculate', 'math']):
+            tool_output += "\nCalculator: " + self.tools.calculator(query).get("result", "Error")
+        if any(kw in query_lower for kw in ['current', 'latest', 'price', 'news', 'today', 'weather']):
+            web = self.tools.web_search_tool(query)
+            tool_output += "\nWeb: " + web['result']['abstract']
         prompt = f"""You are an expert research assistant.
+Context from PDFs:
 {context}
+Tools used: {tool_output}
 Question: {query}
+Answer clearly and confidently."""
         try:
             if not self.groq:
+                answer = "GROQ_API_KEY not found. Add it in Space Secrets."
             else:
                 resp = self.groq.chat.completions.create(
                     model="llama-3.1-70b-versatile",
                 )
                 answer = resp.choices[0].message.content.strip()
         except Exception as e:
+            answer = f"LLM Error: {str(e)}"
         history.append([query, answer])
         audio = self.text_to_speech(answer)
         return history, audio
 # ===================================================================
 # GRADIO INTERFACE
 # ===================================================================
def create_app():
    """Build the Gradio UI around a fresh ``AgenticRAGAgent`` and return it.

    Left column: chat history, question box, Send/Clear buttons and the audio
    player for the spoken answer. Right column: PDF upload and a status box.
    """
    agent = AgenticRAGAgent()

    def respond(q, h):
        """Answer *q*, returning (cleared textbox, new history, audio)."""
        updated_history, speech = agent.process_query(q, h)
        return "", updated_history, speech

    def reset_chat():
        """Empty the chat history and drop the current audio."""
        return [], None

    with gr.Blocks(theme=gr.themes.Soft(), title="AI Research Agent") as demo:
        gr.Markdown("# 🤖 AI Research Agent\nAgentic RAG • Web Search • Calculator • Voice Answers")

        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=600)
                question = gr.Textbox(
                    placeholder="Ask anything about your PDFs or the world...",
                    label="Question",
                )
                with gr.Row():
                    send_button = gr.Button("Send 🚀", variant="primary")
                    clear_button = gr.Button("Clear")
                voice = gr.Audio(label="Voice Answer", autoplay=True)
            with gr.Column(scale=1):
                gr.Markdown("### Upload PDFs")
                uploads = gr.Files(file_types=[".pdf"], file_count="multiple")
                upload_status = gr.Textbox(label="Status", interactive=False, lines=6)

        # Enter in the textbox and the Send button trigger the same handler.
        question.submit(respond, [question, chatbot], [question, chatbot, voice])
        send_button.click(respond, [question, chatbot], [question, chatbot, voice])
        clear_button.click(reset_chat, outputs=[chatbot, voice])
        uploads.change(agent.upload_pdfs, uploads, upload_status)

        gr.Markdown("**Required**: Add `GROQ_API_KEY` in Settings → Secrets (free at [console.groq.com](https://console.groq.com))")

    return demo
 # LAUNCH
 # ===================================================================
 if __name__ == "__main__":
+    app = create_app()
     app.launch(server_name="0.0.0.0", server_port=7860)