Spaces:

isana25
/

RAG_Application

Sleeping

App Files Files Community

isana25 committed on May 14, 2025

Commit

2fd0797

verified ·

1 Parent(s): 06895fd

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -12

app.py CHANGED Viewed

@@ -2,24 +2,47 @@ import gradio as gr
 import tempfile
 import os
 import numpy as np
-from utils import extract_text_from_pdf, chunk_text, embed_chunks, build_faiss_index
 from sentence_transformers import SentenceTransformer
 from groq import Groq
-# ✅ Load Groq API Key securely from Hugging Face secrets
 groq_api_key = os.getenv("GROQ_API_KEY")
 client = Groq(api_key=groq_api_key)
 model = SentenceTransformer('all-MiniLM-L6-v2')
 stored_chunks = []
 stored_embeddings = None
 stored_index = None
 def handle_pdf(file):
     global stored_chunks, stored_embeddings, stored_index
-    # Save uploaded PDF
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
         tmp.write(file.read())
         tmp_path = tmp.name
@@ -34,11 +57,10 @@ def handle_pdf(file):
     embeddings = embed_chunks(chunks)
     token_comment = f"✅ Tokenization Done: Embeddings shape {embeddings.shape}."
-    # Vector DB (FAISS)
     index = build_faiss_index(embeddings)
     vector_comment = f"✅ Vector DB Created: FAISS index with {index.ntotal} vectors."
-    # Save state
     stored_chunks = chunks
     stored_embeddings = embeddings
     stored_index = index
@@ -50,7 +72,7 @@ def answer_query(query):
         return "❌ Please upload and process a PDF first."
     query_vec = model.encode([query])
-    D, I = stored_index.search(np.array(query_vec), k=3)
     top_chunks = [stored_chunks[i] for i in I[0]]
     context = "\n\n".join(top_chunks)
@@ -69,15 +91,15 @@ def answer_query(query):
 # Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# 🔍 RAG App with PDF + Groq + LLaMA")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-        process_button = gr.Button("📄 Process PDF")
-    chunk_output = gr.Textbox(label="Chunking Output")
-    token_output = gr.Textbox(label="Tokenization Output")
-    vector_output = gr.Textbox(label="Vector DB Output")
     process_button.click(
         fn=handle_pdf,
@@ -85,7 +107,7 @@ with gr.Blocks() as demo:
         outputs=[chunk_output, token_output, vector_output]
     )
-    gr.Markdown("## 💬 Ask Questions About the Document")
     question_input = gr.Textbox(label="Your Question")
     ask_button = gr.Button("🤖 Ask")

 import tempfile
 import os
 import numpy as np
+import fitz  # PyMuPDF
+import faiss
 from sentence_transformers import SentenceTransformer
 from groq import Groq
+# ✅ Load Groq API key securely
 groq_api_key = os.getenv("GROQ_API_KEY")
 client = Groq(api_key=groq_api_key)
+# Load embedding model
 model = SentenceTransformer('all-MiniLM-L6-v2')
 stored_chunks = []
 stored_embeddings = None
 stored_index = None
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Filesystem path of the PDF, passed straight to
            ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output in page
        order, with nothing inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extraction fails on some page.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of growing a string
        # with += for every page.
        return "".join(page.get_text() for page in doc)
def chunk_text(text, max_chunk_size=500):
    """Split text into consecutive chunks of at most ``max_chunk_size`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    within a chunk they are re-joined with single spaces, so the original
    whitespace (newlines, tabs, runs of spaces) is not preserved.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings in document order. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop all text,
    # so reject every non-positive size up front.
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    words = text.split()
    return [
        " ".join(words[start:start + max_chunk_size])
        for start in range(0, len(words), max_chunk_size)
    ]
def embed_chunks(chunks):
    """Encode text chunks with the module-level embedding ``model``.

    Args:
        chunks: The chunk strings to embed; handed unchanged to
            ``model.encode``.

    Returns:
        The encoder output wrapped in ``np.array``.
        NOTE(review): presumably one row per chunk; confirm against the
        sentence-transformers version in use.
    """
    return np.array(model.encode(chunks))
def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every embedding row.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` of width ``dimension`` with all rows added.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example the result of
            embedding an empty chunk list), since no vector dimension can
            be read from it.
    """
    # FAISS's Python bindings expect C-contiguous float32 rows; coerce here
    # so callers may pass float64 or non-contiguous arrays safely.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dimension); got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
 def handle_pdf(file):
     global stored_chunks, stored_embeddings, stored_index
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
         tmp.write(file.read())
         tmp_path = tmp.name
     embeddings = embed_chunks(chunks)
     token_comment = f"✅ Tokenization Done: Embeddings shape {embeddings.shape}."
+    # Vector DB
     index = build_faiss_index(embeddings)
     vector_comment = f"✅ Vector DB Created: FAISS index with {index.ntotal} vectors."
     stored_chunks = chunks
     stored_embeddings = embeddings
     stored_index = index
         return "❌ Please upload and process a PDF first."
     query_vec = model.encode([query])
+    D, I = stored_index.search(np.array([query_vec]), k=3)
     top_chunks = [stored_chunks[i] for i in I[0]]
     context = "\n\n".join(top_chunks)
 # Gradio UI
 with gr.Blocks() as demo:
+    gr.Markdown("# 📄 RAG PDF Chat with Groq + LLaMA")
     with gr.Row():
         file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+        process_button = gr.Button("📥 Process PDF")
+    chunk_output = gr.Textbox(label="Chunking Status")
+    token_output = gr.Textbox(label="Tokenization Status")
+    vector_output = gr.Textbox(label="Vector DB Status")
     process_button.click(
         fn=handle_pdf,
         outputs=[chunk_output, token_output, vector_output]
     )
+    gr.Markdown("## 💬 Ask a Question About the Document")
     question_input = gr.Textbox(label="Your Question")
     ask_button = gr.Button("🤖 Ask")