# Try / app.py -- "Create app.py" (commit 00b830f, verified), uploaded by umarcui
import gradio as gr
import fitz # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import requests
import json
from typing import List
# Sentence-embedding model used for both document chunks and user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Flat L2 index; the dimension must match the embedder's output size.
dimension = 384  # vector size for MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)

# Chunk texts and their embeddings, appended in the same order as the vectors
# added to `index`, so row i of the index corresponds to stored_chunks[i].
stored_chunks = []
stored_embeddings = []

# The Groq API key is read from the GROQ_API_KEY environment variable so that
# no secret lives in the source. A key that was previously hard-coded here has
# been published with the file and should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
LLM_MODEL = "llama3-8b-8192"
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Either a binary file-like object exposing ``read()`` (its
            bytes are parsed in memory) or a filesystem path (``str`` or
            ``os.PathLike``) to the PDF.

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages.
    """
    if hasattr(pdf_file, "read"):
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        # Some Gradio versions hand the upload over as a path, not a file.
        doc = fitz.open(os.fspath(pdf_file))
    # The context manager closes the document even if a page fails to parse.
    with doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a single-space-joined string.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would crash range() or silently yield nothing.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further chunk would be
        # wholly contained in this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
def embed_and_store(chunks):
    """Embed text chunks and append them to the FAISS index and chunk stores.

    The module-level ``index``, ``stored_chunks`` and ``stored_embeddings``
    are mutated in place, keeping index rows aligned with ``stored_chunks``.

    Args:
        chunks: List of chunk strings to embed. An empty list is a no-op.
    """
    if not chunks:
        # Encoding nothing does not give FAISS the 2-D array it expects.
        return
    # FAISS requires a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    index.add(embeddings)
    stored_chunks.extend(chunks)
    stored_embeddings.extend(embeddings)
def query_groq(prompt):
    """Send a prompt to the Groq chat-completions API and return the reply.

    Args:
        prompt: The user-message content to send to the model.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        RuntimeError: If no API key is configured, or the response body does
            not have the expected chat-completion structure.
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
    """
    if not GROQ_API_KEY:
        raise RuntimeError("GROQ_API_KEY is not set; cannot call the Groq API.")

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful academic supervisor helping students study uploaded research papers."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.2,
    }
    # Without a timeout a stalled connection would block the UI indefinitely.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    # Surface HTTP errors (bad key, rate limit, ...) instead of a KeyError below.
    response.raise_for_status()
    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as err:
        raise RuntimeError("Unexpected response format from the Groq API.") from err
def retrieve_answer(user_query):
    """Answer a question using the stored chunks nearest to it as context.

    The query is embedded, up to three nearest chunks are fetched from the
    FAISS index, and a prompt combining them with the question is sent to
    ``query_groq``.

    Args:
        user_query: The question text.

    Returns:
        The model's answer as returned by ``query_groq``.
    """
    query_vec = np.asarray(embedder.encode([user_query]), dtype=np.float32)

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last chunk.
    top_k = min(3, len(stored_chunks))
    hits = []
    if top_k:
        _, neighbours = index.search(query_vec, k=top_k)
        hits = [
            stored_chunks[i]
            for i in neighbours[0]
            if 0 <= i < len(stored_chunks)
        ]

    context = "\n\n".join(hits)
    prompt = f"Based on the following context:\n\n{context}\n\nAnswer this question:\n{user_query}"
    return query_groq(prompt)
def handle_upload(file):
    """Extract, chunk and index an uploaded PDF; return a status message.

    Args:
        file: The value of the upload component, or ``None`` when the button
            is clicked with no file selected.

    Returns:
        A status string for display in the UI.
    """
    if file is None:
        return "Please select a PDF file before clicking Process PDF."
    text = extract_text_from_pdf(file)
    chunks = chunk_text(text)
    if not chunks:
        # E.g. an image-only PDF: nothing to embed, so don't claim success.
        return "No extractable text was found in this PDF, so nothing was indexed."
    embed_and_store(chunks)
    return "PDF processed and indexed. You can now ask questions."
def handle_question(question):
    """Answer a question, or ask for an upload if nothing is indexed yet.

    Args:
        question: The question text from the UI.

    Returns:
        The result of ``retrieve_answer`` when chunks are stored, otherwise
        a prompt to upload a PDF.
    """
    if stored_chunks:
        return retrieve_answer(question)
    return "Please upload a PDF first."
# Gradio UI: one upload row, one shared output box, and one question row.
# NOTE(review): the nesting below was reconstructed because the source had
# lost its indentation -- confirm which components belong inside each Row.
with gr.Blocks() as demo:
    with gr.Row():
        file_input = gr.File(label="Upload your PDF")
        upload_btn = gr.Button("Process PDF")
    # Shared by both buttons: shows the upload status or the answer.
    output_text = gr.Textbox(label="Status / Answer")
    upload_btn.click(fn=handle_upload, inputs=file_input, outputs=output_text)
    with gr.Row():
        query_input = gr.Textbox(label="Ask a Question")
        query_btn = gr.Button("Submit")
    query_btn.click(fn=handle_question, inputs=query_input, outputs=output_text)
# Start the web server at import/run time.
demo.launch()