Zahid0123 commited on
Commit
8f2c55b
·
verified ·
1 Parent(s): bd04dfb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +319 -0
app.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - FULLY WORKING AI Research Agent for Hugging Face Spaces
2
+ import os
3
+ import re
4
+ import ast
5
+ import operator
6
+ import logging
7
+ import requests
8
+ import tempfile
9
+ import time
10
+ from pathlib import Path
11
+ from typing import List, Dict, Any
12
+ from datetime import datetime
13
+
14
+ import numpy as np
15
+ from tqdm import tqdm
16
+ import PyPDF2
17
+ from sentence_transformers import SentenceTransformer
18
+ import faiss
19
+ from groq import Groq
20
+ import gradio as gr
21
+ from gtts import gTTS
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # ========================================
27
+ # TOOLS & UTILITIES
28
+ # ========================================
29
class WebSearchTool:
    """Thin client for the DuckDuckGo Instant Answer API."""

    def __init__(self, max_results: int = 5):
        # Cap on how many related topics are kept from a response.
        self.max_results = max_results
        self.base_url = "https://api.duckduckgo.com/"

    def search(self, query: str) -> Dict[str, Any]:
        """Query DuckDuckGo and return {'abstract': str, 'related': [...]}.

        Never raises: any network/parse failure is logged and an empty
        result structure is returned instead.
        """
        query_params = {
            'q': query, 'format': 'json', 'no_redirect': '1',
            'no_html': '1', 'skip_disambig': '1'
        }
        try:
            response = requests.get(self.base_url, params=query_params, timeout=10)
            response.raise_for_status()
            payload = response.json()

            related = []
            for topic in payload.get('RelatedTopics', [])[:self.max_results]:
                # Grouped topics lack a 'Text' key; skip them like the API docs suggest.
                if 'Text' not in topic:
                    continue
                related.append({
                    'text': topic.get('Text', ''),
                    'url': topic.get('FirstURL', ''),
                })

            return {
                'abstract': payload.get('Abstract', '') or payload.get('Answer', ''),
                'related': related,
            }
        except Exception as exc:
            logger.error(f"Web search failed: {exc}")
            return {'abstract': '', 'related': []}
56
+
57
+ # ========================================
58
+ # DOCUMENT PROCESSING
59
+ # ========================================
60
class DocumentProcessor:
    """Loads PDF files from a directory tree and extracts their text."""

    def load_documents(self, data_directory: str) -> List[Dict[str, Any]]:
        """Return a list of {'doc_id', 'content', 'file_path'} dicts.

        Recursively scans *data_directory* for ``*.pdf`` files. Files that
        fail to parse are logged and skipped; pages with no extractable
        text are ignored, and documents with no text at all are dropped.
        """
        root = Path(data_directory)
        loaded: List[Dict[str, Any]] = []
        for pdf_path in root.rglob("*.pdf"):
            try:
                # Extract while the handle is open: PyPDF2 reads pages lazily.
                with open(pdf_path, 'rb') as handle:
                    reader = PyPDF2.PdfReader(handle)
                    page_texts = [page.extract_text() for page in reader.pages]
                full_text = "".join(t + "\n" for t in page_texts if t)
                if full_text.strip():
                    loaded.append({
                        'doc_id': str(pdf_path.relative_to(root)),
                        'content': full_text,
                        'file_path': str(pdf_path),
                    })
            except Exception as exc:
                logger.error(f"Error reading {pdf_path}: {exc}")
        return loaded
82
+
83
class DocumentChunker:
    """Splits documents into overlapping, roughly fixed-size text chunks."""

    def __init__(self, chunk_size=512, chunk_overlap=50):
        self.chunk_size = chunk_size        # target characters per chunk
        self.chunk_overlap = chunk_overlap  # characters shared by neighbouring chunks

    def chunk_documents(self, documents: List[Dict]) -> List[Dict]:
        """Return chunk dicts ({'chunk_id', 'content', 'doc_id', 'source_file'}).

        Whitespace is collapsed first; cuts prefer a word boundary in the
        second half of the window; chunks of <= 50 characters are dropped.
        """
        produced: List[Dict] = []
        for document in documents:
            flat = re.sub(r'\s+', ' ', document['content']).strip()
            pos = 0
            while pos < len(flat):
                cut = pos + self.chunk_size
                window = flat[pos:cut]
                # Only shorten to a word boundary when there is more text after
                # this window, and only if the boundary keeps the chunk sizeable.
                if cut < len(flat):
                    boundary = window.rfind(' ')
                    if boundary > self.chunk_size // 2:
                        cut = pos + boundary
                produced.append({
                    'chunk_id': f"{document['doc_id']}_{pos}",
                    'content': flat[pos:cut].strip(),
                    'doc_id': document['doc_id'],
                    'source_file': document['file_path'],
                })
                # Step back by the overlap so adjacent chunks share context.
                pos = cut - self.chunk_overlap
        return [c for c in produced if len(c['content']) > 50]
110
+
111
class EmbeddingGenerator:
    """Wraps a SentenceTransformer model for chunk and query embeddings."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        # Model weights are downloaded/cached on first construction.
        self.model = SentenceTransformer(model_name)

    def generate(self, chunks: List[Dict]) -> np.ndarray:
        """Embed the 'content' of every chunk; returns an (n_chunks, dim) array."""
        contents = [chunk['content'] for chunk in chunks]
        return self.model.encode(
            contents,
            batch_size=32,
            show_progress_bar=False,
            convert_to_numpy=True,
        )

    def query_embedding(self, query: str) -> np.ndarray:
        """Embed a single query string; returns a 1-D vector."""
        encoded = self.model.encode([query], convert_to_numpy=True)
        return encoded[0]
121
+
122
+ # ========================================
123
+ # RETRIEVER
124
+ # ========================================
125
class DocumentRetriever:
    """Cosine-similarity retriever: an inner-product FAISS index over
    L2-normalized chunk embeddings."""

    def __init__(self):
        self.chunks = []    # chunk dicts, row-aligned with the FAISS index
        self.index = None   # faiss.IndexFlatIP, created by build_index()
        self.embedder = EmbeddingGenerator()

    def build_index(self, chunks: List[Dict], embeddings: np.ndarray):
        """Build a flat inner-product index over row-normalized embeddings.

        With unit-norm vectors, inner product equals cosine similarity.
        """
        self.chunks = chunks
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        self.index.add((embeddings / norms).astype('float32'))

    def search(self, query: str, k: int = 8) -> List[Dict]:
        """Return up to *k* chunk dicts, each annotated with 'similarity'.

        Returns [] when no index has been built yet.
        """
        # Explicit None check: SWIG-wrapped FAISS objects should not be
        # relied on for truthiness.
        if self.index is None:
            return []
        q_emb = self.embedder.query_embedding(query)
        q_norm = q_emb / np.linalg.norm(q_emb)
        scores, indices = self.index.search(q_norm.reshape(1, -1).astype('float32'), k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing results with -1; the previous
            # `idx < len(self.chunks)` check let -1 through and silently
            # returned chunks[-1]. Guard the lower bound as well.
            if 0 <= idx < len(self.chunks):
                hit = self.chunks[idx].copy()
                hit['similarity'] = float(score)
                results.append(hit)
        return results
151
+
152
+ # ========================================
153
+ # AGENT TOOLS
154
+ # ========================================
155
class AgenticTools:
    """Small tool belt the agent can call: a safe calculator and web search."""

    # Whitelisted arithmetic operators for calculator expressions.
    _BIN_OPS = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
    }
    _UNARY_OPS = {
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
    }

    def __init__(self):
        # BUGFIX: this attribute was named `self.web_search`, which shadowed
        # the web_search() method below and made it uncallable on instances.
        self._search_tool = WebSearchTool()

    def _eval_node(self, node):
        """Recursively evaluate a whitelisted arithmetic AST node."""
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in self._BIN_OPS:
            return self._BIN_OPS[type(node.op)](
                self._eval_node(node.left), self._eval_node(node.right)
            )
        if isinstance(node, ast.UnaryOp) and type(node.op) in self._UNARY_OPS:
            return self._UNARY_OPS[type(node.op)](self._eval_node(node.operand))
        raise ValueError("Unsupported expression")

    def calculator(self, expr: str):
        """Safely evaluate a basic arithmetic expression.

        Returns {"success": True, "result": <number>} on success, or
        {"success": False, "result": "Invalid math"} on any failure.

        BUGFIX: the previous implementation passed an AST node straight to
        eval(), which raises TypeError, so every call returned "Invalid
        math". This walks the AST with the operator module instead.
        """
        try:
            sanitized = re.sub(r'[^0-9+\-*/(). ]', '', expr)
            tree = ast.parse(sanitized, mode='eval')
            return {"success": True, "result": self._eval_node(tree.body)}
        except Exception:
            return {"success": False, "result": "Invalid math"}

    def web_search(self, query: str):
        """Run a DuckDuckGo search; always reports success with the raw result."""
        return {"success": True, "result": self._search_tool.search(query)}
169
+
170
+ # ========================================
171
+ # MAIN AGENT
172
+ # ========================================
173
class AgenticRAGAgent:
    """RAG agent: indexes uploaded PDFs, answers questions with a Groq LLM,
    and renders each answer to speech with gTTS."""

    def __init__(self):
        self.retriever = None  # built lazily by upload_pdfs()
        # Groq client is only created when the secret is present; answer_query
        # degrades to an explanatory message otherwise.
        api_key = os.getenv("GROQ_API_KEY")
        self.groq = Groq(api_key=api_key) if api_key else None
        self.tools = AgenticTools()

    def clean_for_speech(self, text: str) -> str:
        """Strip markdown markup (bold/italics/code/links) and collapse
        whitespace so TTS reads plain prose."""
        text = re.sub(r'\*\*|\*|_|`|\[.*?\]|\(.*?\)', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def generate_voice(self, text: str):
        """Render *text* to an MP3 file; returns its path, or None on failure."""
        if not text or not text.strip():
            return None
        clean = self.clean_for_speech(text)
        try:
            tts = gTTS(text=clean, lang='en')
            # BUGFIX: saving into a still-open NamedTemporaryFile fails on
            # Windows (the open handle locks the file). mkstemp + close
            # yields a closed, writable path on every platform.
            fd, audio_path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)
            tts.save(audio_path)
            return audio_path
        except Exception as e:
            logger.error(f"TTS failed: {e}")
            return None

    def upload_pdfs(self, files):
        """Save uploaded PDFs into ./sample_data and (re)build the index.

        Returns a human-readable status string for the UI.
        """
        if not files:
            return "No files uploaded."

        os.makedirs("sample_data", exist_ok=True)
        saved = []
        for file in files:
            # Gradio may hand us file-like objects (with .name/.read) or
            # plain path strings, depending on version/configuration.
            name = getattr(file, 'name', file if isinstance(file, str) else '')
            if not str(name).lower().endswith(".pdf"):
                continue
            dest = os.path.join("sample_data", os.path.basename(str(name)))
            # BUGFIX: the old fallback wrote the *object itself* to a binary
            # file (TypeError for path strings). Prefer copying bytes from
            # the on-disk path; fall back to .read() for file-like objects.
            if isinstance(name, str) and os.path.exists(name):
                with open(name, 'rb') as src:
                    data = src.read()
            elif callable(getattr(file, 'read', None)):
                data = file.read()
            else:
                data = file  # last resort: assume raw bytes
            with open(dest, "wb") as f:
                f.write(data)
            saved.append(dest)

        if not saved:
            return "No valid PDF files."

        # Process documents: extract text, chunk, embed, index.
        processor = DocumentProcessor()
        chunker = DocumentChunker()
        docs = processor.load_documents("sample_data")
        chunks = chunker.chunk_documents(docs)
        # BUGFIX: guard the empty case — embeddings.shape[1] raises
        # IndexError when no chunk text could be extracted.
        if not chunks:
            return "No extractable text found in the uploaded PDFs."
        embedder = EmbeddingGenerator()
        embeddings = embedder.generate(chunks)

        self.retriever = DocumentRetriever()
        self.retriever.build_index(chunks, embeddings)

        return f"Loaded {len(saved)} PDFs → {len(chunks)} chunks ready! Ask anything."

    def answer_query(self, query: str, history: list):
        """Answer *query* with retrieved context.

        Returns (updated_history, audio_path_or_None); history is a list of
        [user, assistant] pairs as expected by gr.Chatbot.
        """
        if not query.strip():
            return history, None

        if not history:
            history = []

        # Canned greeting path — no retrieval needed.
        if query.strip().lower() in ["hi", "hello", "hey", "hola"]:
            resp = "Hello! I'm your AI Research Agent with voice answers. Upload PDFs and ask complex questions!"
            history.append([query, resp])
            audio = self.generate_voice(resp)
            return history, audio

        if not self.retriever:
            resp = "Please upload at least one PDF document first!"
            history.append([query, resp])
            return history, None

        # Retrieve top chunks, then keep the 5 best (truncated) as context.
        docs = self.retriever.search(query, k=8)
        context = "\n\n".join([d['content'][:1000] for d in docs[:5]])

        prompt = f"""You are an expert research assistant.
Context from documents:
{context}

Question: {query}
Provide a clear, accurate, and concise answer. Use bullet points if helpful."""

        try:
            if not self.groq:
                answer = "GROQ_API_KEY not set. Set it in Secrets."
            else:
                resp = self.groq.chat.completions.create(
                    model="llama-3.1-70b-versatile",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.3,
                    max_tokens=800
                )
                answer = resp.choices[0].message.content.strip()
        except Exception as e:
            answer = f"LLM Error: {str(e)}"

        history.append([query, answer])
        audio = self.generate_voice(answer)
        return history, audio
273
+
274
+ # ========================================
275
+ # GRADIO APP
276
+ # ========================================
277
def create_app():
    """Assemble the Gradio UI: chat plus voice output on the left, PDF
    upload and status on the right, wired to one AgenticRAGAgent."""
    agent = AgenticRAGAgent()

    with gr.Blocks(title="AI Research Agent - RAG + Voice", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
# 🤖 AI Research Agent (Agentic RAG + Voice)
Upload PDFs → Ask complex questions → Get answers with **voice**
""")

        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=600)
                msg = gr.Textbox(placeholder="Ask about your documents...", label="Your Question")
                with gr.Row():
                    send = gr.Button("Send", variant="primary")
                    clear = gr.Button("Clear Chat")
                audio_out = gr.Audio(label="Voice Response", autoplay=True, interactive=False)

            with gr.Column(scale=1):
                gr.Markdown("### Upload Research PDFs")
                file_input = gr.Files(file_types=[".pdf"], file_count="multiple")
                status = gr.Textbox(label="Status", interactive=False, lines=4)

        def respond(message, chat_history):
            # Clears the textbox, refreshes the chat, and hands back the clip.
            updated, clip = agent.answer_query(message, chat_history)
            return "", updated, clip

        # Both Enter in the textbox and the Send button submit a question.
        msg.submit(respond, [msg, chatbot], [msg, chatbot, audio_out])
        send.click(respond, [msg, chatbot], [msg, chatbot, audio_out])
        clear.click(lambda: ([], None), outputs=[chatbot, audio_out])
        file_input.change(agent.upload_pdfs, file_input, status)

        gr.Markdown("**Secret Required:** Add `GROQ_API_KEY` in Space Secrets (free at console.groq.com)")

    return demo
313
+
314
+ # ========================================
315
+ # LAUNCH
316
+ # ========================================
317
if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the Hugging Face Spaces default port.
    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)