Rishitha3 committed on
Commit
9bf28bf
·
verified ·
1 Parent(s): 2a6a2d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -149
app.py CHANGED
@@ -1,158 +1,96 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
- import re
4
- import numpy as np
5
  import faiss
6
  import os
7
  from sentence_transformers import SentenceTransformer
8
- from transformers import AutoTokenizer, AutoModelForCausalLM
9
- import torch
10
- from huggingface_hub import login
11
 
12
- # -----------------------------
13
- # PDF Text Loader
14
- # -----------------------------
15
- def load_pdf_text(file_obj):
16
- doc = fitz.open(stream=file_obj.read(), filetype="pdf")
 
17
  text = ""
18
- for page in doc:
19
- text += page.get_text()
20
- if not text.strip():
21
- raise ValueError("No text found in PDF.")
 
 
 
 
 
 
22
  return text
23
 
24
- # -----------------------------
25
- # Chunk Text
26
- # -----------------------------
27
- def chunk_text(text, max_tokens=200):
28
- sentences = re.split(r'(?<=[.!?]) +', text)
29
- chunks, current_chunk = [], []
30
- current_len = 0
31
- for sentence in sentences:
32
- word_count = len(sentence.split())
33
- if current_len + word_count > max_tokens:
34
- chunks.append(" ".join(current_chunk))
35
- current_chunk = [sentence]
36
- current_len = word_count
37
- else:
38
- current_chunk.append(sentence)
39
- current_len += word_count
40
- if current_chunk:
41
- chunks.append(" ".join(current_chunk))
42
- return chunks
43
-
44
- # -----------------------------
45
- # Simple Vector Store
46
- # -----------------------------
47
- class SimpleVectorStore:
48
- def __init__(self, dim):
49
- self.dim = dim
50
- self.vectors = []
51
- self.metadata = []
52
- self.index = None
53
-
54
- def add(self, vectors, metas):
55
- for v, m in zip(vectors, metas):
56
- vec = np.array(v, dtype=np.float32)
57
- self.vectors.append(vec)
58
- self.metadata.append(m)
59
- if self.vectors:
60
- self.index = faiss.IndexFlatL2(self.dim)
61
- self.index.add(np.stack(self.vectors))
62
-
63
- def search(self, query_vector, k=5):
64
- query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
65
- D, I = self.index.search(query_vector, k)
66
- results = [self.metadata[i] for i in I[0]]
67
- return results
68
-
69
- # -----------------------------
70
- # Index PDF
71
- # -----------------------------
72
- def index_pdf(file_obj):
73
- text = load_pdf_text(file_obj)
74
- chunks = chunk_text(text)
75
- embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
76
- vectors = embed_model.encode(chunks)
77
- store = SimpleVectorStore(dim=vectors.shape[1])
78
- store.add(vectors, chunks)
79
- return embed_model, store
80
-
81
- # -----------------------------
82
- # Load LLaMA Model
83
- # -----------------------------
84
- def load_llm():
85
- model_id = "meta-llama/Llama-3.2-3b-instruct"
86
- hf_token = os.getenv("HF_TOKEN")
87
- if not hf_token:
88
- raise ValueError("HF_TOKEN is not set. Please add it in Hugging Face Secrets.")
89
- login(hf_token)
90
-
91
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
92
- llm = AutoModelForCausalLM.from_pretrained(
93
- model_id,
94
- device_map="auto",
95
- torch_dtype=torch.float16,
96
- token=hf_token
97
- )
98
- return tokenizer, llm
99
-
100
- # -----------------------------
101
- # HyDE + Answer Query
102
- # -----------------------------
103
- def answer_query(file_obj, question):
104
- try:
105
- embed_model, store = index_pdf(file_obj)
106
- tokenizer, llm = load_llm()
107
-
108
- # ---- Step 1: HyDE hypothetical answer ----
109
- hyde_prompt = f"""
110
- [INST] Write a detailed hypothetical answer to this question:
111
- {question}
112
- Answer: [/INST]
113
- """
114
- inputs = tokenizer(hyde_prompt, return_tensors="pt").to(llm.device)
115
- hyde_out = llm.generate(**inputs, max_new_tokens=200)
116
- hypo_answer = tokenizer.decode(hyde_out[0], skip_special_tokens=True)
117
-
118
- # ---- Step 2: Embed hypothetical answer ----
119
- query_vec = embed_model.encode([hypo_answer])[0]
120
-
121
- # ---- Step 3: Retrieve top chunks ----
122
- relevant_chunks = store.search(query_vec, k=5)
123
- context = "\n".join(relevant_chunks)
124
-
125
- # ---- Step 4: Final Answer ----
126
- final_prompt = f"""
127
- [INST] You are a helpful tutor. Based only on the context below, answer the question.
128
- If context does not have the info, say "I could not find this in the text."
129
- Context:
130
- {context}
131
- Question: {question}
132
- Answer: [/INST]
133
- """
134
- inputs = tokenizer(final_prompt, return_tensors="pt", truncation=True).to(llm.device)
135
- outputs = llm.generate(**inputs, max_new_tokens=300, temperature=0.7, top_p=0.9, do_sample=True)
136
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
137
-
138
- if "Answer:" in answer:
139
- answer = answer.split("Answer:")[-1].strip()
140
-
141
- return answer
142
-
143
- except Exception as e:
144
- return f"⚠️ Error: {e}"
145
-
146
- # -----------------------------
147
- # Gradio UI
148
- # -----------------------------
149
  with gr.Blocks() as demo:
150
- gr.Markdown("## 📚 HyDE RAG Chatbot (PDF Tutor)")
151
- file_input = gr.File(label="Upload PDF", type="filepath")
152
- question = gr.Textbox(label="Ask a Question")
153
- answer = gr.Textbox(label="Answer", interactive=False)
154
- btn = gr.Button("Get Answer")
155
-
156
- btn.click(fn=answer_query, inputs=[file_input, question], outputs=answer)
157
-
158
- demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF for PDFs
3
+ import docx
 
4
  import faiss
5
  import os
6
  from sentence_transformers import SentenceTransformer
7
+ from transformers import pipeline
 
 
8
 
9
# 1. Load embedding + QA model once at import time (shared by all requests).
#    Model ids can be overridden via environment variables without code changes;
#    the defaults keep the original behavior.
EMBED_MODEL_ID = os.getenv("EMBED_MODEL_ID", "sentence-transformers/all-MiniLM-L6-v2")
GEN_MODEL_ID = os.getenv("GEN_MODEL_ID", "gpt2")  # gpt2 is CPU-friendly; replace with a better model if GPU is available

embedding_model = SentenceTransformer(EMBED_MODEL_ID)
qa_model = pipeline("text-generation", model=GEN_MODEL_ID)
12
+
13
+ # 2. Helper: extract text from files
14
# 2. Helper: extract text from files
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    Args:
        file: Either a filesystem path (str) — what ``gr.File(type="filepath")``
              actually delivers — or a file-like object with a ``.name`` attribute.

    Returns:
        The extracted text as a single string (may be empty).
    """
    # gr.File(type="filepath") hands us a plain str, not a file object;
    # the original `file.name` / `file.read()` calls crash in that case.
    path = file if isinstance(file, str) else file.name

    text = ""
    if path.endswith(".pdf"):
        doc = fitz.open(path)
        for page in doc:
            text += page.get_text("text")
        doc.close()  # release the underlying file handle
    elif path.endswith(".docx"):
        doc = docx.Document(path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    else:  # fallback: treat anything else as UTF-8-ish text
        with open(path, "rb") as fh:
            text = fh.read().decode("utf-8", errors="ignore")
    return text
27
 
28
+ # 3. Helper: create FAISS index
29
# 3. Helper: create FAISS index
def build_faiss(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping character chunks, embed them, and index them.

    Args:
        text: The full document text.
        chunk_size: Characters per chunk.
        overlap: Characters shared between consecutive chunks.

    Returns:
        (index, chunks): a ``faiss.IndexFlatL2`` over the chunk embeddings and
        the parallel list of chunk strings (row i of the index ↔ chunks[i]).

    Raises:
        ValueError: If the document contains no extractable text.
    """
    if not text.strip():
        # Embedding an empty chunk list would fail later with an opaque shape error.
        raise ValueError("Document contains no extractable text.")

    # Guard: overlap >= chunk_size would make the range step <= 0
    # (ValueError on step 0, or nonsense chunks on a negative step).
    step = max(1, chunk_size - overlap)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), step)]

    # Embed chunks and store them in an exact L2 index.
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, chunks
43
+
44
+ # Global storage
45
# Module-level state for the most recently indexed document.
doc_index = None   # FAISS index over chunk embeddings; None until a file is indexed
doc_chunks = None  # list of chunk strings, parallel to the index rows
47
+
48
+ # 4. Process uploaded file
49
# 4. Process uploaded file
def upload_file(file):
    """Extract and index an uploaded document; return a status message for the UI.

    Stores the resulting FAISS index and chunk list in the module-level
    ``doc_index`` / ``doc_chunks`` so ``answer_query`` can use them.
    """
    global doc_index, doc_chunks
    if file is None:
        return "⚠️ Please choose a file to upload."
    try:
        text = extract_text(file)
        doc_index, doc_chunks = build_faiss(text)
    except Exception as e:
        # Surface the failure in the Status textbox instead of a raw stack trace,
        # matching the ⚠️-message style used by answer_query.
        return f"⚠️ Failed to index document: {e}"
    return "✅ Document indexed with HyDE! You can now ask questions."
54
+
55
+ # 5. HyDE RAG answering
56
# 5. HyDE RAG answering
def answer_query(query):
    """Answer *query* about the indexed document using HyDE retrieval.

    HyDE: generate a hypothetical answer first, embed THAT instead of the raw
    query, retrieve the closest chunks, then answer from the retrieved context.

    Returns the generated answer, or a ⚠️ message if no document is indexed.
    """
    global doc_index, doc_chunks
    if doc_index is None:
        return "⚠️ Please upload a document first."
    if not query or not query.strip():
        return "⚠️ Please enter a question."

    # Step 1: Generate hypothetical answer (HyDE step).
    # max_new_tokens (not max_length) so generation length is independent of
    # prompt length — max_length counts the prompt and errors once the prompt
    # alone exceeds it.
    hyde_prompt = f"Write a detailed, hypothetical answer to the question:\n\nQuestion: {query}\nAnswer:"
    hypo_answer = qa_model(hyde_prompt, max_new_tokens=150, num_return_sequences=1)[0]["generated_text"]

    # Step 2: Embed the hypothetical answer instead of the raw query.
    q_emb = embedding_model.encode([hypo_answer], convert_to_numpy=True)

    # Step 3: Retrieve the most relevant chunks. Clamp k to the corpus size and
    # drop FAISS's -1 padding ids, which would otherwise silently index
    # doc_chunks[-1].
    k = min(3, len(doc_chunks))
    D, I = doc_index.search(q_emb, k=k)
    retrieved = [doc_chunks[i] for i in I[0] if i >= 0]

    # Step 4: Build final prompt with context.
    context = "\n\n".join(retrieved)
    final_prompt = f"Answer the question based on the context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"

    # Step 5: Generate the final response. text-generation pipelines echo the
    # prompt in "generated_text"; return only the newly generated continuation.
    response = qa_model(final_prompt, max_new_tokens=200, num_return_sequences=1)[0]["generated_text"]
    if response.startswith(final_prompt):
        response = response[len(final_prompt):].strip()
    return response
79
+
80
+ # 6. Gradio UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
# 6. Gradio UI: one button indexes the uploaded document, another asks a question.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 HyDE RAG Chatbot (Chat with Any Document)")

    # Upload row: picker plus its action button side by side.
    with gr.Row():
        file_input = gr.File(label="Upload Document", type="filepath")
        upload_btn = gr.Button("Index Document")

    # Status of indexing, then the Q&A widgets.
    status = gr.Textbox(label="Status")
    query = gr.Textbox(label="Ask a Question")
    answer = gr.Textbox(label="Answer")
    ask_btn = gr.Button("Get Answer")

    # Wire buttons to the backend handlers.
    upload_btn.click(upload_file, inputs=file_input, outputs=status)
    ask_btn.click(answer_query, inputs=query, outputs=answer)

demo.launch()