|
|
import gradio as gr |
|
|
import fitz |
|
|
import faiss |
|
|
import numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
import os |
|
|
import requests |
|
|
import json |
|
|
from typing import List |
|
|
|
|
|
|
|
|
# In-memory stores that grow as PDFs are processed: the chunk texts and the
# embedding rows added alongside them (appended together by embed_and_store).
stored_chunks = []
stored_embeddings = []

# Sentence-embedding model shared by document indexing and query encoding.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Vector width of the flat L2 index.
# NOTE(review): 384 is presumably the output size of the model above — confirm
# if the model name is ever changed.
dimension = 384
index = faiss.IndexFlatL2(dimension)
|
|
|
|
|
|
|
|
# The Groq credential is read from the environment so the secret never lives
# in source control. Export GROQ_API_KEY before launching the app. The key
# that was previously committed here should be treated as leaked and revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Chat-completion model name sent in every Groq request payload.
LLM_MODEL = "llama3-8b-8192"
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file: Either a binary file-like object exposing ``read()`` (the
            original calling convention) or a filesystem path to the PDF.

    Returns:
        The text of all pages, joined in page order.
    """
    if hasattr(pdf_file, "read"):
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        # Path input (e.g. a ``str``): let PyMuPDF open the file itself.
        doc = fitz.open(pdf_file)

    # The context manager closes the document even if extraction raises;
    # join avoids repeated string reallocation on large documents.
    with doc:
        return "".join(page.get_text() for page in doc)
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split text into overlapping, whitespace-delimited word windows.

    Consecutive chunks share ``overlap`` words. The final chunk may be shorter
    than ``chunk_size``, but a trailing chunk that would contain only words
    already covered by the previous chunk is not emitted.

    Args:
        text: Input text; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each re-joined with single spaces.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A zero or negative stride would either crash range() or silently
        # produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already reached the end of the text; any further window
        # would consist purely of overlap words.
        if start + chunk_size >= len(words):
            break
    return chunks
|
|
|
|
|
def embed_and_store(chunks):
    """Embed text chunks and append them to the module-level index and stores.

    Args:
        chunks: List of chunk strings to encode. An empty list is a no-op.

    Side effects:
        Adds one vector per chunk to ``index`` and extends ``stored_chunks``
        and ``stored_embeddings`` in the same order, so a position in the
        index corresponds to the same position in ``stored_chunks``.
    """
    if not chunks:
        # Nothing to encode; skip the index so it never receives an empty,
        # wrongly-shaped array.
        return

    embeddings = embedder.encode(chunks)
    index.add(np.array(embeddings, dtype=np.float32))
    # The lists are mutated in place, so no ``global`` declaration is needed.
    stored_chunks.extend(chunks)
    stored_embeddings.extend(embeddings)
|
|
|
|
|
def query_groq(prompt):
    """Send a prompt to the Groq chat-completions endpoint and return the reply.

    Args:
        prompt: The user message content to send.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful academic supervisor helping students study uploaded research papers."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.2,
    }

    # Without a timeout a stalled connection would block the UI handler forever.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    # Surface auth/quota/model errors as an HTTP error instead of an opaque
    # KeyError when the error payload has no "choices" field.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
|
|
|
|
|
def retrieve_answer(user_query):
    """Answer a question using the most similar indexed chunks as context.

    Encodes the query, looks up at most three nearest chunks in ``index``,
    builds a prompt from them and forwards it to ``query_groq``.

    Args:
        user_query: The question text.

    Returns:
        The model's answer string, or a prompt to upload a PDF when nothing
        has been indexed yet.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would index stored_chunks from the end.
    top_k = min(3, index.ntotal, len(stored_chunks))
    if top_k == 0:
        return "Please upload a PDF first."

    embedded_query = embedder.encode([user_query])
    _, neighbor_ids = index.search(np.array(embedded_query, dtype=np.float32), k=top_k)

    # Keep only valid positions as a second guard against padded results.
    hits = [stored_chunks[i] for i in neighbor_ids[0] if 0 <= i < len(stored_chunks)]
    context = "\n\n".join(hits)

    prompt = f"Based on the following context:\n\n{context}\n\nAnswer this question:\n{user_query}"
    return query_groq(prompt)
|
|
|
|
|
def handle_upload(file):
    """Gradio callback: extract, chunk and index an uploaded PDF.

    Args:
        file: The value delivered by the file-upload component, or ``None``
            when the button is pressed without a file selected.

    Returns:
        A status message for the output textbox.
    """
    if file is None:
        return "Please upload a PDF first."

    text = extract_text_from_pdf(file)
    chunks = chunk_text(text)
    if not chunks:
        # E.g. an image-only PDF: there is nothing to embed or index.
        return "No extractable text was found in this PDF."

    embed_and_store(chunks)
    return "PDF processed and indexed. You can now ask questions."
|
|
|
|
|
def handle_question(question):
    """Gradio callback: answer a question about the indexed PDF content.

    Args:
        question: Text from the question textbox.

    Returns:
        The generated answer, or a guidance message when the question is
        blank or no PDF has been indexed yet.
    """
    # Reject blank input before spending an embedding pass and an API call.
    if not question or not question.strip():
        return "Please enter a question."
    if not stored_chunks:
        return "Please upload a PDF first."
    return retrieve_answer(question)
|
|
|
|
|
# Gradio UI: an upload row, a shared status/answer box, and a question row.
# NOTE(review): the original indentation was lost, so the nesting below
# (which widgets sit inside each Row) is reconstructed — confirm the layout.
with gr.Blocks() as demo:
    with gr.Row():
        file_input = gr.File(label="Upload your PDF")
        upload_btn = gr.Button("Process PDF")

    # One textbox receives both the indexing status and the answers.
    output_text = gr.Textbox(label="Status / Answer")

    upload_btn.click(
        fn=handle_upload,
        inputs=file_input,
        outputs=output_text,
    )

    with gr.Row():
        query_input = gr.Textbox(label="Ask a Question")
        query_btn = gr.Button("Submit")

    query_btn.click(
        fn=handle_question,
        inputs=query_input,
        outputs=output_text,
    )

# Start the web server as soon as the script is executed.
demo.launch()