isana25 commited on
Commit
2fd0797
·
verified ·
1 Parent(s): 06895fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -12
app.py CHANGED
@@ -2,24 +2,47 @@ import gradio as gr
2
  import tempfile
3
  import os
4
  import numpy as np
5
- from utils import extract_text_from_pdf, chunk_text, embed_chunks, build_faiss_index
 
6
  from sentence_transformers import SentenceTransformer
7
  from groq import Groq
8
 
9
- # βœ… Load Groq API Key securely from Hugging Face secrets
10
  groq_api_key = os.getenv("GROQ_API_KEY")
11
  client = Groq(api_key=groq_api_key)
12
 
 
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
 
15
  stored_chunks = []
16
  stored_embeddings = None
17
  stored_index = None
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def handle_pdf(file):
20
  global stored_chunks, stored_embeddings, stored_index
21
 
22
- # Save uploaded PDF
23
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
24
  tmp.write(file.read())
25
  tmp_path = tmp.name
@@ -34,11 +57,10 @@ def handle_pdf(file):
34
  embeddings = embed_chunks(chunks)
35
  token_comment = f"βœ… Tokenization Done: Embeddings shape {embeddings.shape}."
36
 
37
- # Vector DB (FAISS)
38
  index = build_faiss_index(embeddings)
39
  vector_comment = f"βœ… Vector DB Created: FAISS index with {index.ntotal} vectors."
40
 
41
- # Save state
42
  stored_chunks = chunks
43
  stored_embeddings = embeddings
44
  stored_index = index
@@ -50,7 +72,7 @@ def answer_query(query):
50
  return "❌ Please upload and process a PDF first."
51
 
52
  query_vec = model.encode([query])
53
- D, I = stored_index.search(np.array(query_vec), k=3)
54
  top_chunks = [stored_chunks[i] for i in I[0]]
55
 
56
  context = "\n\n".join(top_chunks)
@@ -69,15 +91,15 @@ def answer_query(query):
69
 
70
  # Gradio UI
71
  with gr.Blocks() as demo:
72
- gr.Markdown("# πŸ” RAG App with PDF + Groq + LLaMA")
73
 
74
  with gr.Row():
75
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
76
- process_button = gr.Button("πŸ“„ Process PDF")
77
 
78
- chunk_output = gr.Textbox(label="Chunking Output")
79
- token_output = gr.Textbox(label="Tokenization Output")
80
- vector_output = gr.Textbox(label="Vector DB Output")
81
 
82
  process_button.click(
83
  fn=handle_pdf,
@@ -85,7 +107,7 @@ with gr.Blocks() as demo:
85
  outputs=[chunk_output, token_output, vector_output]
86
  )
87
 
88
- gr.Markdown("## πŸ’¬ Ask Questions About the Document")
89
 
90
  question_input = gr.Textbox(label="Your Question")
91
  ask_button = gr.Button("πŸ€– Ask")
 
2
  import tempfile
3
  import os
4
  import numpy as np
5
+ import fitz # PyMuPDF
6
+ import faiss
7
  from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
 
10
+ # βœ… Load Groq API key securely
11
  groq_api_key = os.getenv("GROQ_API_KEY")
12
  client = Groq(api_key=groq_api_key)
13
 
14
+ # Load embedding model
15
  model = SentenceTransformer('all-MiniLM-L6-v2')
16
 
17
  stored_chunks = []
18
  stored_embeddings = None
19
  stored_index = None
20
 
21
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF to read.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Use the document as a context manager so the file handle is
    # closed even if text extraction raises (the original opened the
    # document and never closed it — a resource leak).
    with fitz.open(pdf_path) as doc:
        # "".join matches the original `text += page.get_text()` output.
        return "".join(page.get_text() for page in doc)
27
+
28
def chunk_text(text, max_chunk_size=500):
    """Split text into chunks of at most `max_chunk_size` words.

    Words are delimited by arbitrary whitespace; each chunk is the
    words re-joined with single spaces.

    Args:
        text: Input document text.
        max_chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings (empty for empty/whitespace-only input).
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), max_chunk_size):
        window = words[start:start + max_chunk_size]
        pieces.append(' '.join(window))
    return pieces
32
+
33
def embed_chunks(chunks):
    """Encode text chunks into dense vectors with the shared model.

    Args:
        chunks: List of text strings to embed.

    Returns:
        A numpy array with one embedding row per chunk.
    """
    # `model` is the module-level SentenceTransformer instance.
    vectors = model.encode(chunks)
    return np.array(vectors)
36
+
37
def build_faiss_index(embeddings):
    """Build an exact L2-distance FAISS index over the embeddings.

    Args:
        embeddings: 2-D float array of shape (n_vectors, dim).

    Returns:
        A populated ``faiss.IndexFlatL2`` containing every row of
        `embeddings`.
    """
    n_dims = embeddings.shape[1]
    idx = faiss.IndexFlatL2(n_dims)
    idx.add(embeddings)
    return idx
42
+
43
  def handle_pdf(file):
44
  global stored_chunks, stored_embeddings, stored_index
45
 
 
46
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
47
  tmp.write(file.read())
48
  tmp_path = tmp.name
 
57
  embeddings = embed_chunks(chunks)
58
  token_comment = f"βœ… Tokenization Done: Embeddings shape {embeddings.shape}."
59
 
60
+ # Vector DB
61
  index = build_faiss_index(embeddings)
62
  vector_comment = f"βœ… Vector DB Created: FAISS index with {index.ntotal} vectors."
63
 
 
64
  stored_chunks = chunks
65
  stored_embeddings = embeddings
66
  stored_index = index
 
72
  return "❌ Please upload and process a PDF first."
73
 
74
  query_vec = model.encode([query])
75
+ D, I = stored_index.search(np.array([query_vec]), k=3)
76
  top_chunks = [stored_chunks[i] for i in I[0]]
77
 
78
  context = "\n\n".join(top_chunks)
 
91
 
92
  # Gradio UI
93
  with gr.Blocks() as demo:
94
+ gr.Markdown("# πŸ“„ RAG PDF Chat with Groq + LLaMA")
95
 
96
  with gr.Row():
97
  file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
98
+ process_button = gr.Button("πŸ“₯ Process PDF")
99
 
100
+ chunk_output = gr.Textbox(label="Chunking Status")
101
+ token_output = gr.Textbox(label="Tokenization Status")
102
+ vector_output = gr.Textbox(label="Vector DB Status")
103
 
104
  process_button.click(
105
  fn=handle_pdf,
 
107
  outputs=[chunk_output, token_output, vector_output]
108
  )
109
 
110
+ gr.Markdown("## πŸ’¬ Ask a Question About the Document")
111
 
112
  question_input = gr.Textbox(label="Your Question")
113
  ask_button = gr.Button("πŸ€– Ask")