Girinath11 committed on
Commit
e789621
·
verified ·
1 Parent(s): 1fbb18c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -288
app.py CHANGED
@@ -1,377 +1,210 @@
1
- """
2
- DocVision AI - Multimodal RAG System
3
- Smart Document & Image Question Answering with Text Extraction
4
- """
5
-
6
  import gradio as gr
7
- import os
8
  from pathlib import Path
9
- import json
10
- import tempfile
11
  from PIL import Image
12
  import PyPDF2
13
  import docx
14
  from sentence_transformers import SentenceTransformer
15
  import faiss
16
  import numpy as np
17
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
18
  import torch
 
19
 
20
- # Initialize models
21
  print("Loading models...")
22
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
23
-
24
- # Using a free LLM from Hugging Face
25
- tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
26
  llm_model = AutoModelForCausalLM.from_pretrained(
27
- "microsoft/phi-2",
28
- torch_dtype=torch.float32,
29
- trust_remote_code=True,
30
  device_map="auto"
31
  )
32
 
33
- # Global storage
34
- document_store = {
35
- 'texts': [],
36
- 'images': [],
37
- 'metadata': [],
38
- 'embeddings': None,
39
- 'index': None
40
- }
41
-
42
- def extract_text_from_pdf(pdf_path):
43
- """Extract text from PDF file"""
44
- text_chunks = []
45
- images = []
46
-
47
- try:
48
- with open(pdf_path, 'rb') as file:
49
- pdf_reader = PyPDF2.PdfReader(file)
50
- for page_num, page in enumerate(pdf_reader.pages):
51
- text = page.extract_text()
52
- if text.strip():
53
- text_chunks.append({
54
- 'content': text,
55
- 'page': page_num + 1,
56
- 'type': 'text'
57
- })
58
- except Exception as e:
59
- print(f"Error extracting PDF: {e}")
60
-
61
- return text_chunks, images
62
 
63
- def extract_text_from_docx(docx_path):
64
- """Extract text from DOCX file"""
65
- text_chunks = []
66
- try:
67
- doc = docx.Document(docx_path)
68
- full_text = []
69
- for para in doc.paragraphs:
70
- if para.text.strip():
71
- full_text.append(para.text)
72
-
73
- text_chunks.append({
74
- 'content': '\n'.join(full_text),
75
- 'type': 'text'
76
- })
77
- except Exception as e:
78
- print(f"Error extracting DOCX: {e}")
79
-
80
- return text_chunks
81
 
82
- def extract_text_from_txt(txt_path):
83
- """Extract text from TXT file"""
84
- try:
85
- with open(txt_path, 'r', encoding='utf-8') as file:
86
- content = file.read()
87
- return [{
88
- 'content': content,
89
- 'type': 'text'
90
- }]
91
- except Exception as e:
92
- print(f"Error extracting TXT: {e}")
93
- return []
94
 
95
- def process_image(image_path):
96
- """Process and store image"""
97
- try:
98
- img = Image.open(image_path)
99
- return {
100
- 'path': image_path,
101
- 'type': 'image'
102
- }
103
- except Exception as e:
104
- print(f"Error processing image: {e}")
105
- return None
106
 
107
- def chunk_text(text, chunk_size=500):
108
- """Split text into smaller chunks"""
109
  words = text.split()
110
  chunks = []
111
- for i in range(0, len(words), chunk_size):
112
- chunk = ' '.join(words[i:i + chunk_size])
113
- chunks.append(chunk)
114
  return chunks
115
 
116
- def process_documents(files):
117
- """Process uploaded documents"""
118
- global document_store
119
 
120
  if not files:
121
- return "No files uploaded!"
122
-
123
- # Reset document store
124
- document_store = {
125
- 'texts': [],
126
- 'images': [],
127
- 'metadata': [],
128
- 'embeddings': None,
129
- 'index': None
130
- }
131
 
132
- total_texts = 0
133
- total_images = 0
134
 
135
  for file in files:
136
- file_path = file.name
137
- file_ext = Path(file_path).suffix.lower()
138
 
139
- if file_ext == '.pdf':
140
- text_chunks, images = extract_text_from_pdf(file_path)
141
- for chunk in text_chunks:
142
- # Split into smaller chunks
143
- small_chunks = chunk_text(chunk['content'])
144
- for sc in small_chunks:
145
- document_store['texts'].append(sc)
146
- document_store['metadata'].append({
147
- 'source': Path(file_path).name,
148
- 'page': chunk.get('page', 'N/A'),
149
- 'type': 'text'
150
- })
151
- total_texts += 1
152
-
153
- elif file_ext == '.docx':
154
- text_chunks = extract_text_from_docx(file_path)
155
- for chunk in text_chunks:
156
- small_chunks = chunk_text(chunk['content'])
157
- for sc in small_chunks:
158
- document_store['texts'].append(sc)
159
- document_store['metadata'].append({
160
- 'source': Path(file_path).name,
161
- 'type': 'text'
162
  })
163
- total_texts += 1
164
-
165
- elif file_ext == '.txt':
166
- text_chunks = extract_text_from_txt(file_path)
167
- for chunk in text_chunks:
168
- small_chunks = chunk_text(chunk['content'])
169
- for sc in small_chunks:
170
- document_store['texts'].append(sc)
171
- document_store['metadata'].append({
172
- 'source': Path(file_path).name,
173
- 'type': 'text'
174
- })
175
- total_texts += 1
176
-
177
- elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
178
- img_data = process_image(file_path)
179
- if img_data:
180
- document_store['images'].append(img_data)
181
- total_images += 1
182
 
183
- # Create embeddings and index
184
- if document_store['texts']:
185
- embeddings = embedding_model.encode(document_store['texts'])
186
- document_store['embeddings'] = embeddings
187
 
188
- # Create FAISS index
189
- dimension = embeddings.shape[1]
190
- index = faiss.IndexFlatL2(dimension)
191
  index.add(embeddings.astype('float32'))
192
- document_store['index'] = index
193
 
194
- return f" Documents processed successfully!\n📄 Text chunks: {total_texts}\n🖼️ Images: {total_images}"
195
 
196
- def retrieve_relevant_context(query, top_k=3):
197
- """Retrieve relevant text chunks for the query"""
198
- if not document_store['texts'] or document_store['index'] is None:
199
  return []
200
 
201
- query_embedding = embedding_model.encode([query])
202
- distances, indices = document_store['index'].search(query_embedding.astype('float32'), top_k)
203
 
204
- relevant_chunks = []
205
  for idx in indices[0]:
206
- if idx < len(document_store['texts']):
207
- relevant_chunks.append({
208
- 'text': document_store['texts'][idx],
209
- 'metadata': document_store['metadata'][idx]
210
- })
211
 
212
- return relevant_chunks
213
 
214
- def generate_answer(query, context_chunks):
215
- """Generate answer using LLM"""
216
- # Prepare context
217
- context = "\n\n".join([chunk['text'] for chunk in context_chunks])
218
 
219
- # Create prompt
220
- prompt = f"""Based on the following context, answer the question accurately and concisely.
221
 
222
- Context:
223
  {context}
224
 
225
- Question: {query}
226
-
227
  Answer:"""
228
 
229
- # Generate response
230
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
231
 
232
  with torch.no_grad():
233
  outputs = llm_model.generate(
234
  inputs.input_ids,
235
- max_new_tokens=300,
236
  temperature=0.7,
237
- do_sample=True,
238
  top_p=0.9,
239
  pad_token_id=tokenizer.eos_token_id
240
  )
241
 
242
  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
243
-
244
- # Extract only the answer part
245
- if "Answer:" in answer:
246
- answer = answer.split("Answer:")[-1].strip()
247
 
248
  return answer
249
 
250
- def find_relevant_images(query):
251
- """Find relevant images based on query keywords"""
252
- if not document_store['images']:
253
- return []
254
 
255
- # Simple keyword matching for images
256
- # You can enhance this with image captioning models
257
- return document_store['images'][:2] # Return first 2 images for now
258
-
259
- def answer_question(query):
260
- """Main function to answer questions"""
261
- if not query.strip():
262
- return "Please enter a question!", None
263
 
264
- if not document_store['texts']:
265
- return "Please upload documents first!", None
266
 
267
- # Retrieve relevant context
268
- relevant_chunks = retrieve_relevant_context(query, top_k=3)
269
-
270
- if not relevant_chunks:
271
- return "No relevant information found in the documents.", None
272
 
273
  # Generate answer
274
- answer = generate_answer(query, relevant_chunks)
275
-
276
- # Find relevant images
277
- relevant_images = find_relevant_images(query)
278
 
279
- # Prepare response
280
- response = f"**Answer:**\n{answer}\n\n"
281
- response += f"\n**Sources:**\n"
282
- for i, chunk in enumerate(relevant_chunks, 1):
283
- source = chunk['metadata'].get('source', 'Unknown')
284
- page = chunk['metadata'].get('page', '')
285
  if page:
286
  response += f"{i}. {source} (Page {page})\n"
287
  else:
288
  response += f"{i}. {source}\n"
289
 
290
  # Return images if available
291
- image_outputs = None
292
- if relevant_images:
293
- image_outputs = [img['path'] for img in relevant_images]
294
 
295
- return response, image_outputs
296
 
297
- # Create Gradio interface
298
- with gr.Blocks(title="📚 DocVision AI - Multimodal RAG", theme=gr.themes.Soft()) as demo:
299
- gr.Markdown("""
300
- # 📚 DocVision AI
301
- ### *Smart Document & Image Question Answering with Multimodal RAG*
302
-
303
- Extract text from documents, upload images, and ask intelligent questions!
304
-
305
- **How to use:**
306
- 1. 📤 **Upload** your documents (PDF, DOCX, TXT) and images (JPG, PNG)
307
- 2. ⚡ **Process** to extract and index content
308
- 3. 💬 **Ask** questions and get accurate answers with relevant images!
309
- """)
310
 
311
  with gr.Row():
312
- with gr.Column(scale=1):
313
- file_upload = gr.File(
314
- label="📁 Upload Documents & Images",
315
  file_count="multiple",
316
- file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png"]
317
- )
318
- process_btn = gr.Button("⚡ Process Documents", variant="primary", size="lg")
319
- status_output = gr.Textbox(label="📊 Processing Status", lines=3)
320
-
321
- with gr.Column(scale=1):
322
- gr.Markdown("### 💬 Ask Your Questions")
323
- question_input = gr.Textbox(
324
- label="Your Question",
325
- placeholder="What would you like to know about your documents?",
326
- lines=3
327
  )
328
- ask_btn = gr.Button("🔍 Get Answer", variant="primary", size="lg")
329
-
330
- with gr.Row():
331
- answer_output = gr.Markdown(label="📝 Answer & Sources")
 
 
332
 
333
- with gr.Row():
334
- image_output = gr.Gallery(
335
- label="🖼️ Relevant Images from Documents",
336
- columns=2,
337
- height="auto"
338
- )
339
 
340
- # Example questions
341
- gr.Markdown("### 📌 Try These Example Questions:")
342
  gr.Examples(
343
  examples=[
344
- ["What is the main topic of this document?"],
345
- ["Summarize the key points mentioned"],
346
- ["What are the important dates or numbers mentioned?"],
347
- ["List the main findings or conclusions"],
348
  ],
349
- inputs=question_input
350
  )
351
 
352
- gr.Markdown("""
353
- ---
354
- **Powered by:** 🤗 Hugging Face | Microsoft Phi-2 | Sentence Transformers | FAISS
355
- """)
356
-
357
- # Event handlers
358
- process_btn.click(
359
- fn=process_documents,
360
- inputs=[file_upload],
361
- outputs=[status_output]
362
- )
363
-
364
- ask_btn.click(
365
- fn=answer_question,
366
- inputs=[question_input],
367
- outputs=[answer_output, image_output]
368
- )
369
-
370
- question_input.submit(
371
- fn=answer_question,
372
- inputs=[question_input],
373
- outputs=[answer_output, image_output]
374
- )
375
 
376
  if __name__ == "__main__":
377
- demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
 
2
  from pathlib import Path
 
 
3
  from PIL import Image
4
  import PyPDF2
5
  import docx
6
  from sentence_transformers import SentenceTransformer
7
  import faiss
8
  import numpy as np
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
  import torch
11
+ from datetime import datetime
12
 
13
# Load models
print("Loading models...")

# Sentence encoder used for both document chunks and user queries.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

_LLM_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(_LLM_NAME)

# Half precision is only used when a GPU is present. On a CPU-only host
# float16 inference is slow and, depending on the torch build, some ops are
# not implemented at all, so fall back to float32 there.
llm_model = AutoModelForCausalLM.from_pretrained(
    _LLM_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

# Store documents
documents = []            # list of {'text', 'source'[, 'page']} chunk records
images = []               # paths of uploaded image files
embeddings_index = None   # FAISS index over `documents`, or None when empty
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
def extract_pdf_text(pdf_path):
    """Return one record per PDF page that yields non-blank text.

    Each record is a dict with the page's extracted 'text', its 1-based
    'page' number and 'source' (the file name component of ``pdf_path``).
    Pages whose extracted text is empty or whitespace-only are skipped.
    """
    source_name = Path(pdf_path).name
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction has to happen while the file handle is still open.
        page_texts = (
            (number, page.extract_text())
            for number, page in enumerate(reader.pages, start=1)
        )
        return [
            {'text': body, 'page': number, 'source': source_name}
            for number, body in page_texts
            if body.strip()
        ]
 
 
 
 
 
 
 
 
 
37
 
38
def extract_docx_text(docx_path):
    """Return the text of a DOCX file as a single-record list.

    Non-blank paragraphs are joined with newlines into one 'text' value;
    'source' is the file name component of ``docx_path``.
    """
    non_blank = []
    for paragraph in docx.Document(docx_path).paragraphs:
        if paragraph.text.strip():
            non_blank.append(paragraph.text)

    record = {'text': '\n'.join(non_blank), 'source': Path(docx_path).name}
    return [record]
 
 
 
 
 
 
 
 
42
 
43
def extract_txt_text(txt_path):
    """Return the contents of a plain-text file as a single-record list.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    dropped so it does not end up attached to the first word, and bytes that
    are not valid UTF-8 are replaced with U+FFFD instead of aborting the
    whole upload.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        ``[{'text': <file contents>, 'source': <file name>}]``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # 'utf-8-sig' reads files with or without a BOM.
    with open(txt_path, 'r', encoding='utf-8-sig', errors='replace') as f:
        text = f.read()
    return [{'text': text, 'source': Path(txt_path).name}]
 
 
 
 
 
 
 
47
 
48
def chunk_text(text, size=400, overlap=0):
    """Split ``text`` into chunks of at most ``size`` whitespace-separated words.

    Args:
        text: The text to split. Runs of whitespace are collapsed, because
            words are re-joined with single spaces.
        size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < size``. The default of 0 produces
            back-to-back chunks.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``size`` or ``overlap`` is out of range.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words that are already covered.
        if start + size >= len(words):
            break
    return chunks
54
 
55
def process_files(files):
    """Extract, chunk and index the uploaded files.

    Text files (.pdf, .docx, .txt) are split into word chunks and embedded
    into a fresh FAISS L2 index; image files (.jpg, .jpeg, .png) are recorded
    by path. Files with any other extension are ignored.

    The module-level ``documents``, ``images`` and ``embeddings_index`` are
    replaced together, and only after all processing has finished, so a
    failure part-way through can never leave the chunk list paired with an
    index that was built from a different upload.

    Args:
        files: Uploaded files as delivered by the Gradio file component.
            Each item is either a path string or an object whose ``name``
            attribute is the path.

    Returns:
        A status string with the number of text chunks and images. If some
        files could not be read, a second line lists them.
    """
    global documents, images, embeddings_index

    if not files:
        return "Please upload files first"

    extractors = {
        '.pdf': extract_pdf_text,
        '.docx': extract_docx_text,
        '.txt': extract_txt_text,
    }
    image_exts = {'.jpg', '.jpeg', '.png'}

    new_documents = []
    new_images = []
    failed = []

    for file in files:
        # Accept both plain path strings and file-like objects with .name.
        path = getattr(file, 'name', file)
        ext = Path(path).suffix.lower()

        if ext in image_exts:
            new_images.append(path)
            continue

        extractor = extractors.get(ext)
        if extractor is None:
            continue

        try:
            chunks = extractor(path)
        except Exception as exc:
            # One unreadable file should not discard the rest of the batch;
            # it is reported in the status message instead.
            failed.append(f"{Path(path).name} ({exc})")
            continue

        for chunk in chunks:
            for small_chunk in chunk_text(chunk['text']):
                entry = {'text': small_chunk, 'source': chunk['source']}
                if 'page' in chunk:
                    entry['page'] = chunk['page']
                new_documents.append(entry)

    # Create embeddings
    new_index = None
    if new_documents:
        texts = [doc['text'] for doc in new_documents]
        embeddings = embedding_model.encode(texts)

        new_index = faiss.IndexFlatL2(embeddings.shape[1])
        new_index.add(embeddings.astype('float32'))

    # Publish the new state in one step, including a None index when the
    # upload contained no text.
    documents, images, embeddings_index = new_documents, new_images, new_index

    status = f"Processed {len(documents)} text chunks and {len(images)} images"
    if failed:
        status += "\nCould not read: " + ", ".join(failed)
    return status
102
 
103
def search_documents(query, k=3):
    """Return up to ``k`` indexed chunks closest to ``query``.

    Args:
        query: The user's question.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk records from ``documents``, nearest first. Empty when
        nothing has been indexed or ``k`` is less than 1.
    """
    if not documents or embeddings_index is None:
        return []

    # Never ask for more neighbours than there are chunks.
    k = min(k, len(documents))
    if k < 1:
        return []

    query_vec = embedding_model.encode([query])
    _, indices = embeddings_index.search(query_vec.astype('float32'), k)

    # FAISS fills missing results with -1. A plain `idx < len(documents)`
    # check would let that through and silently return the last chunk.
    return [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]
116
 
117
def generate_answer(question, context_docs):
    """Generate an answer to ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        context_docs: Chunk records (dicts with a 'text' key) used as context.

    Returns:
        The text generated by the model after the prompt, stripped of
        surrounding whitespace.
    """
    max_prompt_tokens = 1500
    safety_margin = 16  # re-tokenising trimmed text can shift the count a little

    context = '\n\n'.join([doc['text'] for doc in context_docs])

    header = "Answer the question based on this context:\n\n"
    footer = f"\n\nQuestion: {question}\nAnswer:"

    # Trim the context, not the whole prompt. Truncating the full prompt on
    # the right would drop the question and the "Answer:" cue whenever the
    # retrieved chunks are long.
    frame_tokens = len(tokenizer(header + footer)["input_ids"])
    budget = max(max_prompt_tokens - frame_tokens - safety_margin, 0)
    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) > budget:
        context = tokenizer.decode(context_ids[:budget], skip_special_tokens=True)

    prompt = header + context + footer

    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=max_prompt_tokens
    )
    # With device_map="auto" the model may be on a GPU; the inputs must be too.
    inputs = inputs.to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=250,
            do_sample=True,  # temperature / top_p only take effect when sampling
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Answer:" goes wrong when the model itself emits that string.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()
142
 
143
def answer_query(question):
    """Answer a question from the indexed documents.

    Args:
        question: The question text from the UI. ``None`` and
            whitespace-only input are treated as empty.

    Returns:
        A ``(markdown, images)`` tuple. ``markdown`` holds the answer followed
        by a numbered source list, or a short message when the question is
        empty, nothing is indexed, or nothing relevant is found. ``images``
        is a list of up to two uploaded image paths, or ``None`` when there
        are none or a message is returned.
    """
    # Reject blank questions before running retrieval and generation.
    question = (question or "").strip()
    if not question:
        return "Please enter a question", None

    if not documents:
        return "Please upload documents first", None

    # Search relevant docs
    relevant_docs = search_documents(question)

    if not relevant_docs:
        return "No relevant info found", None

    # Generate answer
    answer = generate_answer(question, relevant_docs)

    # Format response
    response = f"**Answer:**\n{answer}\n\n**Sources:**\n"
    for i, doc in enumerate(relevant_docs, 1):
        source = doc['source']
        page = doc.get('page', '')
        if page:
            response += f"{i}. {source} (Page {page})\n"
        else:
            response += f"{i}. {source}\n"

    # Return images if available. They are not matched to the question:
    # this is simply the first two uploaded images.
    imgs = images[:2] if images else None

    return response, imgs
173
 
174
# UI
with gr.Blocks(title="DocVision AI") as app:
    gr.Markdown("# DocVision AI - Document Q&A System")
    gr.Markdown("Upload documents and ask questions to get AI-powered answers")

    with gr.Row():
        # Left column: upload and indexing.
        with gr.Column():
            file_input = gr.File(
                label="Upload Files (PDF, DOCX, TXT, Images)",
                file_count="multiple",
                # Keep in sync with the extensions process_files handles;
                # ".jpeg" is accepted there, so offer it in the picker too.
                file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png"]
            )
            process_btn = gr.Button("Process Documents", variant="primary")
            status = gr.Textbox(label="Status")

        # Right column: question entry.
        with gr.Column():
            question = gr.Textbox(label="Ask a Question", lines=2)
            ask_btn = gr.Button("Get Answer", variant="primary")

    answer = gr.Markdown(label="Answer")
    gallery = gr.Gallery(label="Related Images", columns=2)

    gr.Examples(
        examples=[
            ["What is this document about?"],
            ["Summarize the main points"],
            ["What are the key findings?"]
        ],
        inputs=question
    )

    # The button and pressing Enter in the textbox both trigger the same handler.
    process_btn.click(process_files, inputs=[file_input], outputs=[status])
    ask_btn.click(answer_query, inputs=[question], outputs=[answer, gallery])
    question.submit(answer_query, inputs=[question], outputs=[answer, gallery])

if __name__ == "__main__":
    app.launch()