isana25 commited on
Commit
97f8372
·
verified ·
1 Parent(s): 298ec1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -56
app.py CHANGED
@@ -7,7 +7,7 @@ import faiss
7
  from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
 
10
- # ✅ Load Groq API key securely
11
  groq_api_key = os.getenv("GROQ_API_KEY")
12
  client = Groq(api_key=groq_api_key)
13
 
@@ -15,7 +15,6 @@ client = Groq(api_key=groq_api_key)
15
  model = SentenceTransformer('all-MiniLM-L6-v2')
16
 
17
  stored_chunks = []
18
- stored_embeddings = None
19
  stored_index = None
20
 
21
  def extract_text_from_pdf(pdf_path):
@@ -25,23 +24,8 @@ def extract_text_from_pdf(pdf_path):
25
  text += page.get_text()
26
  return text
27
 
28
- def chunk_text(text, max_chunk_size=500):
29
- words = text.split()
30
- chunks = [' '.join(words[i:i+max_chunk_size]) for i in range(0, len(words), max_chunk_size)]
31
- return chunks
32
-
33
- def embed_chunks(chunks):
34
- embeddings = model.encode(chunks)
35
- return np.array(embeddings)
36
-
37
- def build_faiss_index(embeddings):
38
- dimension = embeddings.shape[1]
39
- index = faiss.IndexFlatL2(dimension)
40
- index.add(embeddings)
41
- return index
42
-
43
  def handle_pdf(file):
44
- global stored_chunks, stored_embeddings, stored_index
45
 
46
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
47
  tmp.write(file.read())
@@ -49,32 +33,24 @@ def handle_pdf(file):
49
 
50
  text = extract_text_from_pdf(tmp_path)
51
 
52
- # Chunking
53
- chunks = chunk_text(text)
54
- chunk_comment = f"✅ Chunking Done: {len(chunks)} chunks created."
55
-
56
- # Tokenization
57
- embeddings = embed_chunks(chunks)
58
- token_comment = f"✅ Tokenization Done: Embeddings shape {embeddings.shape}."
59
-
60
- # Vector DB
61
- index = build_faiss_index(embeddings)
62
- vector_comment = f"✅ Vector DB Created: FAISS index with {index.ntotal} vectors."
63
 
64
  stored_chunks = chunks
65
- stored_embeddings = embeddings
66
  stored_index = index
67
 
68
- return chunk_comment, token_comment, vector_comment
69
 
70
  def answer_query(query):
71
- if stored_index is None or not stored_chunks:
72
  return "❌ Please upload and process a PDF first."
73
 
74
  query_vec = model.encode([query])
75
  D, I = stored_index.search(np.array([query_vec]), k=3)
76
  top_chunks = [stored_chunks[i] for i in I[0]]
77
-
78
  context = "\n\n".join(top_chunks)
79
 
80
  prompt = f"""Answer the question based on the context below:\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"""
@@ -91,32 +67,18 @@ def answer_query(query):
91
 
92
  # Gradio UI
93
  with gr.Blocks() as demo:
94
- gr.Markdown("# 📄 RAG PDF Chat with Groq + LLaMA")
95
-
96
- with gr.Row():
97
- file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
98
- process_button = gr.Button("📥 Process PDF")
99
-
100
- chunk_output = gr.Textbox(label="Chunking Status")
101
- token_output = gr.Textbox(label="Tokenization Status")
102
- vector_output = gr.Textbox(label="Vector DB Status")
103
 
104
- process_button.click(
105
- fn=handle_pdf,
106
- inputs=[file_input],
107
- outputs=[chunk_output, token_output, vector_output]
108
- )
109
 
110
- gr.Markdown("## 💬 Ask a Question About the Document")
111
 
112
- question_input = gr.Textbox(label="Your Question")
113
- ask_button = gr.Button("🤖 Ask")
114
- answer_output = gr.Textbox(label="Answer", lines=5)
 
115
 
116
- ask_button.click(
117
- fn=answer_query,
118
- inputs=[question_input],
119
- outputs=[answer_output]
120
- )
121
 
122
  demo.launch()
 
7
  from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
 
10
+ # ✅ Load Groq API key securely from Hugging Face secret
11
  groq_api_key = os.getenv("GROQ_API_KEY")
12
  client = Groq(api_key=groq_api_key)
13
 
 
15
  model = SentenceTransformer('all-MiniLM-L6-v2')
16
 
17
  stored_chunks = []
 
18
  stored_index = None
19
 
20
  def extract_text_from_pdf(pdf_path):
 
24
  text += page.get_text()
25
  return text
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def handle_pdf(file):
28
+ global stored_chunks, stored_index
29
 
30
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
31
  tmp.write(file.read())
 
33
 
34
  text = extract_text_from_pdf(tmp_path)
35
 
36
+ # Chunk and embed
37
+ chunks = [' '.join(text.split()[i:i+500]) for i in range(0, len(text.split()), 500)]
38
+ embeddings = model.encode(chunks)
39
+ index = faiss.IndexFlatL2(embeddings.shape[1])
40
+ index.add(np.array(embeddings))
 
 
 
 
 
 
41
 
42
  stored_chunks = chunks
 
43
  stored_index = index
44
 
45
+ return "✅ PDF processed successfully. You can now ask a question."
46
 
47
  def answer_query(query):
48
+ if not stored_chunks or stored_index is None:
49
  return "❌ Please upload and process a PDF first."
50
 
51
  query_vec = model.encode([query])
52
  D, I = stored_index.search(np.array([query_vec]), k=3)
53
  top_chunks = [stored_chunks[i] for i in I[0]]
 
54
  context = "\n\n".join(top_chunks)
55
 
56
  prompt = f"""Answer the question based on the context below:\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"""
 
67
 
68
  # Gradio UI
69
  with gr.Blocks() as demo:
70
+ gr.Markdown("# 📄 Ask Your PDF - Powered by Groq + LLaMA")
 
 
 
 
 
 
 
 
71
 
72
+ file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
73
+ status_output = gr.Textbox(label="Status")
 
 
 
74
 
75
+ file_input.change(fn=handle_pdf, inputs=file_input, outputs=status_output)
76
 
77
+ gr.Markdown("## 💬 Ask a Question About Your PDF")
78
+ question = gr.Textbox(label="Your Question")
79
+ ask_button = gr.Button("Ask")
80
+ answer = gr.Textbox(label="Answer", lines=5)
81
 
82
+ ask_button.click(fn=answer_query, inputs=question, outputs=answer)
 
 
 
 
83
 
84
  demo.launch()