umarcui committed on
Commit
00b830f
·
verified ·
1 Parent(s): e2536be

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ import os
7
+ import requests
8
+ import json
9
+ from typing import List
10
+
11
# Sentence-embedding model used to vectorize both PDF chunks and user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# FAISS index over L2 distance. 384 is the output dimension of
# all-MiniLM-L6-v2, so every vector added below must match it.
dimension = 384
index = faiss.IndexFlatL2(dimension)
stored_chunks = []      # raw text chunks, positionally aligned with the index
stored_embeddings = []  # parallel list of vectors; written but not read elsewhere in this file

# SECURITY: never commit API keys to source control — the previous hard-coded
# key is compromised and must be revoked. Read the key from the environment
# (on Hugging Face Spaces, set GROQ_API_KEY as a repository secret).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
LLM_MODEL = "llama3-8b-8192"
23
+
24
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in an uploaded PDF.

    Args:
        pdf_file: Binary file-like object with a ``.read()`` method
            (as provided by the Gradio ``File`` component).

    Returns:
        str: All page text joined in page order ("" for an empty document).
    """
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        # join() runs at C speed; the original `text +=` loop was quadratic.
        return "".join(page.get_text() for page in doc)
    finally:
        # fitz documents hold native (MuPDF) resources — close explicitly
        # instead of leaking one handle per upload.
        doc.close()
30
+
31
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Source text to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If ``overlap >= chunk_size``. (Previously this either
            produced an opaque ``range()`` error or silently returned no
            chunks at all.)
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap  # positive stride between chunk starts
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
38
+
39
def embed_and_store(chunks):
    """Embed each text chunk and append it to the module-level FAISS index.

    Side effects: grows ``index`` and the parallel ``stored_chunks`` /
    ``stored_embeddings`` lists; positions stay aligned with the index.
    """
    global stored_chunks, stored_embeddings
    vectors = embedder.encode(chunks)
    # FAISS requires contiguous float32 input.
    index.add(np.array(vectors, dtype=np.float32))
    stored_chunks += list(chunks)
    stored_embeddings += list(vectors)
45
+
46
def query_groq(prompt):
    """Send *prompt* to the Groq chat-completions API and return the reply text.

    Args:
        prompt: User message; the system role is fixed to an academic
            supervisor persona.

    Returns:
        str: Content of the first completion choice.

    Raises:
        requests.HTTPError: On non-2xx responses (previously auth/quota
            failures crashed later with an opaque ``KeyError``).
        requests.Timeout: If the API does not answer within 60 seconds.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": "You are a helpful academic supervisor helping students study uploaded research papers."},
            {"role": "user", "content": prompt},
        ],
        # Low temperature keeps answers grounded in the retrieved context.
        "temperature": 0.2,
    }

    # timeout= prevents the Gradio worker from hanging forever on a stalled
    # connection; raise_for_status() surfaces API errors early and clearly.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
64
+
65
def retrieve_answer(user_query):
    """Answer *user_query* using the most similar indexed chunks as context.

    Args:
        user_query: Natural-language question about the uploaded PDF.

    Returns:
        str: LLM answer grounded in the top-3 retrieved chunks.
    """
    embedded_query = embedder.encode([user_query])
    _, I = index.search(np.array(embedded_query, dtype=np.float32), k=3)
    # FAISS pads results with -1 when fewer than k vectors are indexed;
    # without this filter, stored_chunks[-1] silently returned the wrong chunk.
    hits = [stored_chunks[i] for i in I[0] if 0 <= i < len(stored_chunks)]
    context = "\n\n".join(hits)
    prompt = f"Based on the following context:\n\n{context}\n\nAnswer this question:\n{user_query}"
    return query_groq(prompt)
71
+
72
def handle_upload(file):
    """Extract, chunk, and index the uploaded PDF; return a status message."""
    raw_text = extract_text_from_pdf(file)
    embed_and_store(chunk_text(raw_text))
    return "PDF processed and indexed. You can now ask questions."
77
+
78
def handle_question(question):
    """Answer *question* against the indexed PDF, or prompt for an upload first."""
    if stored_chunks:
        return retrieve_answer(question)
    return "Please upload a PDF first."
82
+
83
# --- Gradio UI: upload/process a PDF, then ask questions against it --------
with gr.Blocks() as demo:
    with gr.Row():
        file_input = gr.File(label="Upload your PDF")
        upload_btn = gr.Button("Process PDF")
        output_text = gr.Textbox(label="Status / Answer")

    upload_btn.click(fn=handle_upload, inputs=file_input, outputs=output_text)

    with gr.Row():
        query_input = gr.Textbox(label="Ask a Question")
        query_btn = gr.Button("Submit")
        query_btn.click(fn=handle_question, inputs=query_input, outputs=output_text)

# Guarding launch() keeps the module importable (e.g. for tests) without
# starting a server; running the script directly still launches the app.
if __name__ == "__main__":
    demo.launch()