Girinath11 committed on
Commit
f4a3eb8
·
verified ·
1 Parent(s): d13318b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +377 -0
app.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DocVision AI - Multimodal RAG System
3
+ Smart Document & Image Question Answering with Text Extraction
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ from pathlib import Path
9
+ import json
10
+ import tempfile
11
+ from PIL import Image
12
+ import PyPDF2
13
+ import docx
14
+ from sentence_transformers import SentenceTransformer
15
+ import faiss
16
+ import numpy as np
17
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
18
+ import torch
19
+
20
# Initialize models (runs once, when the module is imported)
print("Loading models...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Using a free LLM from Hugging Face
# NOTE(review): trust_remote_code=True lets the checkpoint repository supply
# its own Python code at load time -- confirm this is still required.
_LLM_CHECKPOINT = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(_LLM_CHECKPOINT, trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained(
    _LLM_CHECKPOINT,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    device_map="auto",
)

# Global storage shared by the processing and question-answering functions:
#   texts      - list of text chunks
#   images     - list of image records
#   metadata   - one dict per entry of ``texts`` (same order)
#   embeddings - embedding matrix of ``texts`` (None until documents are processed)
#   index      - FAISS index built over ``embeddings`` (None until then)
document_store = dict(
    texts=[],
    images=[],
    metadata=[],
    embeddings=None,
    index=None,
)
41
+
42
def extract_text_from_pdf(pdf_path):
    """Extract the text of every non-empty page of a PDF file.

    Extraction is best-effort: a page that fails to parse is logged and
    skipped so the remaining pages are still processed, and a file that
    cannot be opened or parsed at all yields whatever was collected so far.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A ``(text_chunks, images)`` tuple. ``text_chunks`` is a list of dicts
        with the keys ``'content'`` (page text), ``'page'`` (1-based page
        number) and ``'type'`` (always ``'text'``). ``images`` is always an
        empty list; no image extraction is performed here.
    """
    text_chunks = []
    images = []  # kept for the return shape; nothing is ever added to it

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # Guard against a None result so .strip() cannot fail.
                    text = page.extract_text() or ""
                except Exception as e:
                    # One broken page must not discard all following pages.
                    print(f"Error extracting PDF page {page_num}: {e}")
                    continue

                if text.strip():
                    text_chunks.append({
                        'content': text,
                        'page': page_num,
                        'type': 'text',
                    })
    except Exception as e:
        print(f"Error extracting PDF: {e}")

    return text_chunks, images
62
+
63
def extract_text_from_docx(docx_path):
    """Extract the paragraph text of a DOCX file as a single chunk.

    Only ``doc.paragraphs`` is read; whitespace-only paragraphs are dropped
    and the rest are joined with newlines.

    Args:
        docx_path: Filesystem path of the DOCX file to read.

    Returns:
        A list holding one dict with the keys ``'content'`` and ``'type'``
        (``'text'``), or an empty list when the document has no text or could
        not be read (the error is printed).
    """
    text_chunks = []
    try:
        doc = docx.Document(docx_path)
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

        # Do not emit an empty chunk for a document without any text; this
        # mirrors the PDF extractor, which skips pages with no text.
        if paragraphs:
            text_chunks.append({
                'content': '\n'.join(paragraphs),
                'type': 'text',
            })
    except Exception as e:
        print(f"Error extracting DOCX: {e}")

    return text_chunks
81
+
82
def extract_text_from_txt(txt_path):
    """Read a plain-text file and return its content as a single chunk.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of failing, so a file in another encoding still
    contributes its readable text rather than being dropped entirely.

    Args:
        txt_path: Filesystem path of the text file to read.

    Returns:
        A list holding one dict with the keys ``'content'`` and ``'type'``
        (``'text'``), or an empty list if the file could not be read (the
        error is printed).
    """
    try:
        with open(txt_path, 'r', encoding='utf-8', errors='replace') as file:
            content = file.read()
    except Exception as e:
        print(f"Error extracting TXT: {e}")
        return []

    return [{
        'content': content,
        'type': 'text',
    }]
94
+
95
def process_image(image_path):
    """Check that an image file can be opened and build its store record.

    Args:
        image_path: Filesystem path of the image.

    Returns:
        A dict with the keys ``'path'`` and ``'type'`` (``'image'``), or
        ``None`` if the file could not be opened as an image (the error is
        printed).
    """
    try:
        # Opening is only a validity check; the context manager releases the
        # underlying file handle instead of leaving it open.
        with Image.open(image_path):
            pass
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

    return {
        'path': image_path,
        'type': 'image',
    }
106
+
107
def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace is a
    separator and the words of each chunk are re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A non-positive size would either raise an obscure range() error (0) or
    # silently discard the whole text (negative), so reject it explicitly.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
115
+
116
# Image extensions that are stored as images rather than parsed for text.
_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp'}


def _extract_text_chunks(file_path, file_ext):
    """Return the raw text chunks of a supported document, else ``None``."""
    if file_ext == '.pdf':
        text_chunks, _images = extract_text_from_pdf(file_path)
        return text_chunks
    if file_ext == '.docx':
        return extract_text_from_docx(file_path)
    if file_ext == '.txt':
        return extract_text_from_txt(file_path)
    return None


def _store_text_chunks(store, text_chunks, source):
    """Split chunks into small pieces, append them to ``store``; return the count."""
    added = 0
    for chunk in text_chunks:
        for piece in chunk_text(chunk['content']):
            metadata = {'source': source}
            # Only chunks that carry a page number (PDF pages) record one.
            if 'page' in chunk:
                metadata['page'] = chunk['page']
            metadata['type'] = 'text'

            store['texts'].append(piece)
            store['metadata'].append(metadata)
            added += 1
    return added


def process_documents(files):
    """Process uploaded documents and rebuild the global document store.

    The store is reset, every file is dispatched on its extension (PDF, DOCX
    and TXT are chunked as text; JPG/JPEG/PNG/GIF/BMP are stored as images;
    anything else is logged and skipped), and a FAISS L2 index is built over
    the embeddings of all text chunks.

    Args:
        files: Iterable of uploads. Each item is either an object with a
            ``name`` attribute holding the file path or a plain path string.

    Returns:
        A human-readable status message.
    """
    global document_store

    if not files:
        return "No files uploaded!"

    # Reset document store
    document_store = {
        'texts': [],
        'images': [],
        'metadata': [],
        'embeddings': None,
        'index': None,
    }

    total_texts = 0
    total_images = 0

    for file in files:
        # Accept both upload objects exposing ``.name`` and bare path strings.
        file_path = getattr(file, 'name', file)
        path = Path(file_path)
        file_ext = path.suffix.lower()

        text_chunks = _extract_text_chunks(file_path, file_ext)
        if text_chunks is not None:
            total_texts += _store_text_chunks(document_store, text_chunks, path.name)
        elif file_ext in _IMAGE_EXTENSIONS:
            img_data = process_image(file_path)
            if img_data:
                document_store['images'].append(img_data)
                total_images += 1
        else:
            print(f"Skipping unsupported file type: {path.name}")

    # Create embeddings and index
    if document_store['texts']:
        embeddings = embedding_model.encode(document_store['texts'])
        document_store['embeddings'] = embeddings

        # Create FAISS index
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings.astype('float32'))
        document_store['index'] = index

    return f"βœ… Documents processed successfully!\nπŸ“„ Text chunks: {total_texts}\nπŸ–ΌοΈ Images: {total_images}"
195
+
196
def retrieve_relevant_context(query, top_k=3):
    """Retrieve the stored text chunks closest to the query.

    The query is embedded with the shared embedding model and searched
    against the FAISS index held in ``document_store``.

    Args:
        query: The question text.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with the keys ``'text'`` and ``'metadata'``, in the
        order returned by the index search. Empty when nothing is indexed or
        ``top_k`` is smaller than 1.
    """
    texts = document_store['texts']
    index = document_store['index']
    if not texts or index is None or top_k < 1:
        return []

    # Never request more neighbours than there are stored chunks.
    neighbours = min(top_k, len(texts))

    query_embedding = embedding_model.encode([query])
    _distances, indices = index.search(query_embedding.astype('float32'), neighbours)

    metadata = document_store['metadata']
    # FAISS pads missing results with -1. A bare ``idx < len(texts)`` check
    # would accept -1 and return the *last* chunk through negative indexing,
    # so the lower bound is checked as well.
    return [
        {'text': texts[idx], 'metadata': metadata[idx]}
        for idx in indices[0]
        if 0 <= idx < len(texts)
    ]
213
+
214
def generate_answer(query, context_chunks):
    """Generate an answer to the query from the retrieved chunks using the LLM.

    Args:
        query: The question text.
        context_chunks: List of dicts, each with a ``'text'`` key; the texts
            are joined with blank lines to form the prompt context.

    Returns:
        The generated answer with surrounding whitespace removed.
    """
    # Prepare context
    context = "\n\n".join(chunk['text'] for chunk in context_chunks)

    # Create prompt
    prompt = (
        "Based on the following context, answer the question accurately and concisely.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Answer:"
    )

    # NOTE(review): truncation to 1500 tokens presumably cuts from the end of
    # the prompt, which would drop the question for a very long context --
    # confirm the tokenizer's truncation side.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
    # Put the tensors on the device the model was loaded on.
    inputs = inputs.to(llm_model.device)
    prompt_length = inputs.input_ids.shape[1]

    # Generate response
    with torch.no_grad():
        outputs = llm_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Answer:" picks the wrong segment whenever that marker also occurs
    # in the context or the question.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()
249
+
250
def find_relevant_images(query):
    """Return up to two stored images for the query.

    This is a placeholder: ``query`` is not consulted yet, and the first two
    entries of ``document_store['images']`` are returned as-is. An image
    captioning model could be used here to rank images against the query.

    Args:
        query: The question text (currently unused).

    Returns:
        A list with at most two image records; empty when no images are stored.
    """
    stored_images = document_store['images']
    return stored_images[:2] if stored_images else []
258
+
259
def answer_question(query):
    """Answer a question from the processed documents.

    Retrieves the most relevant chunks, generates an answer from them, and
    lists the source (and page, when recorded) of each chunk used.

    Args:
        query: The question text; ``None``, empty or whitespace-only input is
            rejected with a prompt to enter a question.

    Returns:
        A ``(response, image_paths)`` tuple. ``response`` is Markdown text;
        ``image_paths`` is a list of image paths, or ``None`` when there are
        no images or the question could not be answered.
    """
    # ``not query`` also covers None, which would otherwise fail on .strip().
    if not query or not query.strip():
        return "Please enter a question!", None

    if not document_store['texts']:
        return "Please upload documents first!", None

    # Retrieve relevant context
    relevant_chunks = retrieve_relevant_context(query, top_k=3)
    if not relevant_chunks:
        return "No relevant information found in the documents.", None

    # Generate answer
    answer = generate_answer(query, relevant_chunks)

    # Find relevant images
    relevant_images = find_relevant_images(query)

    # Prepare response: the answer followed by a numbered source list
    parts = [f"**Answer:**\n{answer}\n\n", "\n**Sources:**\n"]
    for position, chunk in enumerate(relevant_chunks, 1):
        source = chunk['metadata'].get('source', 'Unknown')
        page = chunk['metadata'].get('page', '')
        if page:
            parts.append(f"{position}. {source} (Page {page})\n")
        else:
            parts.append(f"{position}. {source}\n")
    response = "".join(parts)

    # Return images if available (None rather than an empty list)
    image_outputs = [img['path'] for img in relevant_images] or None

    return response, image_outputs
296
+
297
+ # Create Gradio interface
298
# Markdown shown at the top of the page.
_HEADER_MARKDOWN = """
# πŸ“š DocVision AI
### *Smart Document & Image Question Answering with Multimodal RAG*

Extract text from documents, upload images, and ask intelligent questions!

**How to use:**
1. πŸ“€ **Upload** your documents (PDF, DOCX, TXT) and images (JPG, PNG)
2. ⚑ **Process** to extract and index content
3. πŸ’¬ **Ask** questions and get accurate answers with relevant images!
"""

# Markdown shown at the bottom of the page.
_FOOTER_MARKDOWN = """
---
**Powered by:** πŸ€— Hugging Face | Microsoft Phi-2 | Sentence Transformers | FAISS
"""

with gr.Blocks(title="πŸ“š DocVision AI - Multimodal RAG", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row():
        # Left column: upload and processing
        with gr.Column(scale=1):
            file_upload = gr.File(
                label="πŸ“ Upload Documents & Images",
                file_count="multiple",
                # Includes .gif and .bmp so the picker offers every image
                # type that process_documents knows how to handle.
                file_types=[
                    ".pdf", ".docx", ".txt",
                    ".jpg", ".jpeg", ".png", ".gif", ".bmp",
                ],
            )
            process_btn = gr.Button("⚑ Process Documents", variant="primary", size="lg")
            status_output = gr.Textbox(label="πŸ“Š Processing Status", lines=3)

        # Right column: question entry
        with gr.Column(scale=1):
            gr.Markdown("### πŸ’¬ Ask Your Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about your documents?",
                lines=3,
            )
            ask_btn = gr.Button("πŸ” Get Answer", variant="primary", size="lg")

    with gr.Row():
        answer_output = gr.Markdown(label="πŸ“ Answer & Sources")

    with gr.Row():
        image_output = gr.Gallery(
            label="πŸ–ΌοΈ Relevant Images from Documents",
            columns=2,
            height="auto",
        )

    # Example questions
    gr.Markdown("### πŸ“Œ Try These Example Questions:")
    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["Summarize the key points mentioned"],
            ["What are the important dates or numbers mentioned?"],
            ["List the main findings or conclusions"],
        ],
        inputs=question_input,
    )

    gr.Markdown(_FOOTER_MARKDOWN)

    # Event handlers
    process_btn.click(
        fn=process_documents,
        inputs=[file_upload],
        outputs=[status_output],
    )

    # Clicking the button and submitting the textbox run the same handler.
    for register in (ask_btn.click, question_input.submit):
        register(
            fn=answer_question,
            inputs=[question_input],
            outputs=[answer_output, image_output],
        )

if __name__ == "__main__":
    demo.launch()