Spaces:

HuzaifaTech
/

Multi_documents

Sleeping

App Files Community

HuzaifaTech committed 22 days ago

Commit

933c098

verified ·

1 Parent(s): c789ee4

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -58

app.py CHANGED Viewed

@@ -10,7 +10,8 @@ from groq import Groq
 # =========================
 # 🔑 GROQ API (HF SECRET)
 # =========================
 # The API key is read from the environment variable "Multi_doc";
 # os.getenv returns None when that variable is not set.
-client = Groq(api_key=os.getenv("Multi_doc"))
 # =========================
 # 📄 LOAD DOCUMENTS
@@ -24,14 +25,18 @@ def load_docx(path):
     return "\n".join([p.text for p in doc.paragraphs])
 # Read the whole file at `path` as UTF-8 text and return it as one string.
 # NOTE(review): the file object created by open() is not explicitly closed.
 def load_txt(path):
-    return open(path, "r", encoding="utf-8").read()
 # Pick a loader from the file extension of `path` and return its result.
 # Handles "pdf", "docx" and "txt"; any other extension raises ValueError.
 def load_document(path):
     # Everything after the last "." in the path, lower-cased for matching.
     # A path containing no "." yields the whole lower-cased path here.
     ext = path.split(".")[-1].lower()
-    if ext == "pdf": return load_pdf(path)
-    if ext == "docx": return load_docx(path)
-    if ext == "txt": return load_txt(path)
-    raise ValueError("Unsupported file type")
 # =========================
 # ✂️ CHUNKING
@@ -45,7 +50,7 @@ def chunk_text(text, size=400, overlap=80):
     while i < len(words):
         chunks.append({
             "id": cid,
-            "text": " ".join(words[i:i+size])
         })
         i += size - overlap
         cid += 1
@@ -55,36 +60,49 @@ def chunk_text(text, size=400, overlap=80):
 # =========================
 # 🧠 EMBEDDINGS (LOCAL)
 # =========================
 # Module-level SentenceTransformer instance for "all-MiniLM-L6-v2",
 # created once when this module is executed.
-model = SentenceTransformer("all-MiniLM-L6-v2")
 # Encode `texts` with the shared model and return the encoding converted
 # to plain Python lists via .tolist().
 def embed(texts):
-    return model.encode(texts).tolist()
 # =========================
-# 🗄️ CHROMA DB (PERSISTENT)
 # =========================
 # Chroma client storing its data under ./chroma_db (a path relative to the
 # process working directory), plus the "rag" collection that retrieve() queries.
-client_db = chromadb.PersistentClient(path="./chroma_db")
-collection = client_db.get_or_create_collection("rag")
 # =========================
 # 📁 PROCESS FILES
 # =========================
 def process_files(files):
     if not files:
-        return "⚠️ No files uploaded"
     all_chunks = []
     for f in files:
-        text = load_document(f.name)
-        chunks = chunk_text(text)
-        for c in chunks:
-            all_chunks.append({
-                "source": f.name,
-                "text": c["text"]
-            })
     texts = [c["text"] for c in all_chunks]
     embeddings = embed(texts)
@@ -96,13 +114,21 @@ def process_files(files):
         metadatas=[{"source": c["source"]} for c in all_chunks]
     )
-    return f"✅ Indexed {len(files)} file(s) successfully!"
 # =========================
 # 🔍 RETRIEVAL
 # =========================
 def retrieve(query, k=3):
     q_emb = embed([query])[0]
     results = collection.query(
@@ -123,18 +149,18 @@ def retrieve(query, k=3):
 # 🤖 GROQ GENERATION
 # =========================
 def generate(query):
     docs = retrieve(query)
     context = "\n\n".join(
         [f"[{d['source']}]\n{d['text']}" for d in docs]
     )
-    prompt = f"""
-You are a strict RAG assistant.
 Answer ONLY from the context below.
-If not found, say: "Not found in documents."
 CONTEXT:
 {context}
@@ -142,57 +168,79 @@ CONTEXT:
 QUESTION:
 {query}
-ANSWER:
-"""
-    response = client.chat.completions.create(
-        model="llama3-8b-8192",
-        messages=[
-            {"role": "user", "content": prompt}
-        ]
-    )
-    answer = response.choices[0].message.content
     sources = "\n\n".join(
-        [f"📄 {d['source']}\n{d['text'][:200]}" for d in docs]
     )
-    return answer + "\n\n---\n📚 Sources:\n" + sources
 # =========================
 # 💬 CHAT FUNCTION
 # =========================
 # Chat callback: produce a reply for `message` via generate() and record the
 # (message, reply) pair by appending to `history` in place.
 # Returns ("", history); the UI wiring sends the first element to the input
 # textbox and the second to the chatbot.
 def chat(message, history):
     reply = generate(message)
     history.append((message, reply))
     return "", history
 # =========================
-# 🎨 UI (HF GRADIO)
 # =========================
 # Two columns inside one row: document upload/indexing first, chat second.
-with gr.Blocks() as app:
-    gr.Markdown("# 🧠 Groq RAG Assistant (HF Deployable)")
     with gr.Row():
         with gr.Column(scale=1):
-            files = gr.File(file_count="multiple", file_types=[".pdf", ".docx", ".txt"])
-            process_btn = gr.Button("🚀 Process Files")
-            status = gr.Textbox()
             # Clicking the button passes the uploaded files to process_files
             # and writes its return value into `status`.
-            process_btn.click(process_files, files, status)
         with gr.Column(scale=2):
-            chatbot = gr.Chatbot(height=500)
-            msg = gr.Textbox(placeholder="Ask your documents...")
             # Submitting `msg` calls chat(msg, chatbot); its two return values
             # are written back to `msg` and `chatbot` respectively.
-            msg.submit(chat, [msg, chatbot], [msg, chatbot])
-# =========================
-# 🚀 RUN
-# =========================
 # Not guarded by __name__ == "__main__": this runs whenever the module executes.
-app.launch()

 # =========================
 # 🔑 GROQ API (HF SECRET)
 # =========================
+# Set your secret as "GROQ_API_KEY" in HF Space Settings → Variables and secrets
 # os.getenv returns None when GROQ_API_KEY is not set; that value is passed
 # to Groq() unchanged.
+groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 # =========================
 # 📄 LOAD DOCUMENTS
     return "\n".join([p.text for p in doc.paragraphs])
 # Read the whole file at `path` as UTF-8 text and return it as one string.
 # The `with` block closes the file once the read has finished.
 def load_txt(path):
+    with open(path, "r", encoding="utf-8") as f:
+        return f.read()
 # Pick a loader from the file extension of `path` and return its result.
 # Handles "pdf", "docx" and "txt"; any other extension raises ValueError
 # with the offending extension included in the message.
 def load_document(path):
     # Everything after the last "." in the path, lower-cased for matching.
     # A path containing no "." yields the whole lower-cased path here.
     ext = path.split(".")[-1].lower()
+    if ext == "pdf":
+        return load_pdf(path)
+    if ext == "docx":
+        return load_docx(path)
+    if ext == "txt":
+        return load_txt(path)
+    raise ValueError(f"Unsupported file type: .{ext}")
 # =========================
 # ✂️ CHUNKING
     while i < len(words):
         chunks.append({
             "id": cid,
+            "text": " ".join(words[i:i + size])
         })
         i += size - overlap
         cid += 1
 # =========================
 # 🧠 EMBEDDINGS (LOCAL)
 # =========================
 # Module-level SentenceTransformer instance for "all-MiniLM-L6-v2",
 # created once when this module is executed.
+embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Encode `texts` with the shared model (progress bar disabled) and return
 # the encoding converted to plain Python lists via .tolist().
 def embed(texts):
+    return embed_model.encode(texts, show_progress_bar=False).tolist()
 # =========================
+# 🗄️ CHROMA DB
+# HF Spaces has a read-only root — use /tmp for writable storage
 # =========================
 # Chroma client storing its data under /tmp/chroma_db, plus the "rag"
 # collection that retrieve() counts and queries.
 # NOTE(review): contents of /tmp presumably do not survive a Space restart,
 # so the index would have to be rebuilt afterwards — confirm.
+chroma_client = chromadb.PersistentClient(path="/tmp/chroma_db")
+collection = chroma_client.get_or_create_collection("rag")
 # =========================
 # 📁 PROCESS FILES
 # =========================
 def process_files(files):
     if not files:
+        return "⚠️ No files uploaded."
     all_chunks = []
+    errors = []
     for f in files:
+        # Gradio on HF passes file path as a string or NamedString
+        file_path = f if isinstance(f, str) else f.name
+        if not file_path:
+            continue
+        try:
+            text = load_document(file_path)
+            if not text.strip():
+                errors.append(f"⚠️ {os.path.basename(file_path)} appears empty.")
+                continue
+            chunks = chunk_text(text)
+            for c in chunks:
+                all_chunks.append({
+                    "source": os.path.basename(file_path),
+                    "text": c["text"]
+                })
+        except Exception as e:
+            errors.append(f"❌ Error reading {os.path.basename(file_path)}: {e}")
+    if not all_chunks:
+        return "\n".join(errors) if errors else "⚠️ No content could be extracted."
     texts = [c["text"] for c in all_chunks]
     embeddings = embed(texts)
         metadatas=[{"source": c["source"]} for c in all_chunks]
     )
+    result = f"✅ Indexed {len(files)} file(s) — {len(all_chunks)} chunks stored."
+    if errors:
+        result += "\n" + "\n".join(errors)
+    return result
 # =========================
 # 🔍 RETRIEVAL
 # =========================
 def retrieve(query, k=3):
+    # Guard: collection might be empty
+    count = collection.count()
+    if count == 0:
+        return []
+    k = min(k, count)  # Can't retrieve more than what's stored
     q_emb = embed([query])[0]
     results = collection.query(
 # 🤖 GROQ GENERATION
 # =========================
 def generate(query):
     docs = retrieve(query)
+    if not docs:
+        return "⚠️ No documents indexed yet. Please upload and process files first."
     context = "\n\n".join(
         [f"[{d['source']}]\n{d['text']}" for d in docs]
     )
+    prompt = f"""You are a strict RAG assistant.
 Answer ONLY from the context below.
+If the answer is not found in the context, say: "Not found in documents."
 CONTEXT:
 {context}
 QUESTION:
 {query}
+ANSWER:"""
+    try:
+        response = groq_client.chat.completions.create(
+            model="llama3-8b-8192",
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.2,
+            max_tokens=1024,
+        )
+        answer = response.choices[0].message.content
+    except Exception as e:
+        return f"❌ Groq API error: {e}"
     sources = "\n\n".join(
+        [f"📄 **{d['source']}**\n{d['text'][:200]}…" for d in docs]
     )
+    return f"{answer}\n\n---\n📚 **Sources:**\n{sources}"
 # =========================
 # 💬 CHAT FUNCTION
 # =========================
 # Chat callback: produce a reply for `message` via generate() and record the
 # (message, reply) pair by appending to `history` in place.
 # Returns ("", history); the UI wiring sends the first element to the input
 # textbox and the second to the chatbot.
 def chat(message, history):
     # Empty or whitespace-only input: return the conversation unchanged
     # without calling generate().
+    if not message.strip():
+        return "", history
     reply = generate(message)
     history.append((message, reply))
     return "", history
 # =========================
+# 🎨 GRADIO UI
 # =========================
 # Two columns inside one row: document upload/indexing first, chat second.
+with gr.Blocks(title="Groq RAG Assistant") as app:
+    gr.Markdown(
+        """# 🧠 Groq RAG Assistant
+        Upload your documents, then ask questions about them.
+        Powered by **Groq LLaMA3** + **ChromaDB** + **sentence-transformers**.
+        """
+    )
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### 📂 Upload Documents")
+            files = gr.File(
+                file_count="multiple",
+                file_types=[".pdf", ".docx", ".txt"],
+                label="Upload PDF / DOCX / TXT"
+            )
+            process_btn = gr.Button("🚀 Process Files", variant="primary")
+            status = gr.Textbox(label="Status", interactive=False)
             # Clicking the button passes the uploaded files to process_files
             # and writes its return value into the read-only `status` box.
+            process_btn.click(fn=process_files, inputs=files, outputs=status)
         with gr.Column(scale=2):
+            gr.Markdown("### 💬 Ask Your Documents")
+            chatbot = gr.Chatbot(height=480, bubble_full_width=False)
+            msg = gr.Textbox(
+                placeholder="Ask a question about your documents…",
+                label="Your question",
+                lines=2
+            )
+            with gr.Row():
+                submit_btn = gr.Button("Send", variant="primary")
+                clear_btn = gr.Button("Clear Chat")
             # The Send button and submitting the textbox are wired identically:
             # both call chat(msg, chatbot) and write its two return values back
             # to `msg` and `chatbot`.
+            submit_btn.click(fn=chat, inputs=[msg, chatbot], outputs=[msg, chatbot])
+            msg.submit(fn=chat, inputs=[msg, chatbot], outputs=[msg, chatbot])
             # Clear: set the chatbot to an empty list and the textbox to "".
+            clear_btn.click(fn=lambda: ([], ""), outputs=[chatbot, msg])
+# =========================
+# 🚀 LAUNCH
+# =========================
 # Start the Gradio app only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    app.launch()