Girinath11 committed on
Commit
e56b39d
·
verified ·
1 Parent(s): e789621

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -42
app.py CHANGED
@@ -6,13 +6,17 @@ import docx
6
  from sentence_transformers import SentenceTransformer
7
  import faiss
8
  import numpy as np
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
  import torch
11
  from datetime import datetime
 
 
12
 
13
  # Load models
14
  print("Loading models...")
15
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
 
16
  tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
17
  llm_model = AutoModelForCausalLM.from_pretrained(
18
  "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -20,51 +24,119 @@ llm_model = AutoModelForCausalLM.from_pretrained(
20
  device_map="auto"
21
  )
22
 
23
- # Store documents
 
 
 
 
 
 
 
 
 
24
  documents = []
25
  images = []
 
26
  embeddings_index = None
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def extract_pdf_text(pdf_path):
 
29
  chunks = []
30
  with open(pdf_path, 'rb') as f:
31
  pdf = PyPDF2.PdfReader(f)
32
  for i, page in enumerate(pdf.pages):
33
  text = page.extract_text()
34
  if text.strip():
35
- chunks.append({'text': text, 'page': i+1, 'source': Path(pdf_path).name})
 
 
 
 
36
  return chunks
37
 
38
  def extract_docx_text(docx_path):
 
39
  doc = docx.Document(docx_path)
40
  text = '\n'.join([p.text for p in doc.paragraphs if p.text.strip()])
41
  return [{'text': text, 'source': Path(docx_path).name}]
42
 
43
  def extract_txt_text(txt_path):
 
44
  with open(txt_path, 'r', encoding='utf-8') as f:
45
  text = f.read()
46
  return [{'text': text, 'source': Path(txt_path).name}]
47
 
48
  def chunk_text(text, size=400):
 
49
  words = text.split()
50
  chunks = []
51
  for i in range(0, len(words), size):
52
  chunks.append(' '.join(words[i:i+size]))
53
  return chunks
54
 
55
- def process_files(files):
56
- global documents, images, embeddings_index
 
57
 
58
  if not files:
59
  return "Please upload files first"
60
 
61
  documents = []
62
  images = []
 
 
 
63
 
64
- for file in files:
 
65
  ext = Path(file.name).suffix.lower()
66
 
 
67
  if ext == '.pdf':
 
68
  chunks = extract_pdf_text(file.name)
69
  for chunk in chunks:
70
  for small_chunk in chunk_text(chunk['text']):
@@ -73,34 +145,59 @@ def process_files(files):
73
  'source': chunk['source'],
74
  'page': chunk.get('page', '')
75
  })
 
 
 
 
 
 
 
 
76
 
77
  elif ext == '.docx':
78
  chunks = extract_docx_text(file.name)
79
  for chunk in chunks:
80
  for small_chunk in chunk_text(chunk['text']):
81
- documents.append({'text': small_chunk, 'source': chunk['source']})
 
 
 
82
 
83
  elif ext == '.txt':
84
  chunks = extract_txt_text(file.name)
85
  for chunk in chunks:
86
  for small_chunk in chunk_text(chunk['text']):
87
- documents.append({'text': small_chunk, 'source': chunk['source']})
 
 
 
88
 
89
- elif ext in ['.jpg', '.jpeg', '.png']:
90
- images.append(file.name)
 
 
 
 
 
 
 
91
 
92
- # Create embeddings
 
93
  if documents:
94
  texts = [doc['text'] for doc in documents]
95
- embeddings = embedding_model.encode(texts)
96
 
97
  index = faiss.IndexFlatL2(embeddings.shape[1])
98
  index.add(embeddings.astype('float32'))
99
  embeddings_index = index
100
 
101
- return f"Processed {len(documents)} text chunks and {len(images)} images"
 
 
102
 
103
  def search_documents(query, k=3):
 
104
  if not documents or embeddings_index is None:
105
  return []
106
 
@@ -114,24 +211,59 @@ def search_documents(query, k=3):
114
 
115
  return results
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  def generate_answer(question, context_docs):
 
118
  context = '\n\n'.join([doc['text'] for doc in context_docs])
119
 
120
- prompt = f"""Answer the question based on this context:
121
 
122
  {context}
123
 
124
  Question: {question}
125
  Answer:"""
126
 
127
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
128
 
129
  with torch.no_grad():
130
  outputs = llm_model.generate(
131
  inputs.input_ids,
132
- max_new_tokens=250,
133
  temperature=0.7,
134
- top_p=0.9,
135
  pad_token_id=tokenizer.eos_token_id
136
  )
137
 
@@ -140,24 +272,27 @@ Answer:"""
140
 
141
  return answer
142
 
143
- def answer_query(question):
 
144
  if not question:
145
- return "Please enter a question", None
146
 
147
  if not documents:
148
- return "Please upload documents first", None
149
 
150
- # Search relevant docs
151
- relevant_docs = search_documents(question)
 
152
 
153
  if not relevant_docs:
154
- return "No relevant info found", None
155
 
156
  # Generate answer
 
157
  answer = generate_answer(question, relevant_docs)
158
 
159
  # Format response
160
- response = f"**Answer:**\n{answer}\n\n**Sources:**\n"
161
  for i, doc in enumerate(relevant_docs, 1):
162
  source = doc['source']
163
  page = doc.get('page', '')
@@ -166,45 +301,85 @@ def answer_query(question):
166
  else:
167
  response += f"{i}. {source}\n"
168
 
169
- # Return images if available
170
- imgs = images[:2] if images else None
 
171
 
172
- return response, imgs
 
 
 
 
 
 
 
 
173
 
174
  # UI
175
- with gr.Blocks(title="DocVision AI") as app:
176
- gr.Markdown("# DocVision AI - Document Q&A System")
177
- gr.Markdown("Upload documents and ask questions to get AI-powered answers")
 
 
178
 
179
  with gr.Row():
180
  with gr.Column():
181
  file_input = gr.File(
182
- label="Upload Files (PDF, DOCX, TXT, Images)",
183
  file_count="multiple",
184
- file_types=[".pdf", ".docx", ".txt", ".jpg", ".png"]
185
  )
186
- process_btn = gr.Button("Process Documents", variant="primary")
187
- status = gr.Textbox(label="Status")
188
 
189
  with gr.Column():
190
- question = gr.Textbox(label="Ask a Question", lines=2)
191
- ask_btn = gr.Button("Get Answer", variant="primary")
 
 
 
 
 
 
192
 
193
- answer = gr.Markdown(label="Answer")
194
- gallery = gr.Gallery(label="Related Images", columns=2)
 
 
 
 
195
 
 
196
  gr.Examples(
197
  examples=[
198
  ["What is this document about?"],
199
  ["Summarize the main points"],
200
- ["What are the key findings?"]
 
201
  ],
202
  inputs=question
203
  )
204
 
205
- process_btn.click(process_files, inputs=[file_input], outputs=[status])
206
- ask_btn.click(answer_query, inputs=[question], outputs=[answer, gallery])
207
- question.submit(answer_query, inputs=[question], outputs=[answer, gallery])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  if __name__ == "__main__":
210
  app.launch()
 
6
  from sentence_transformers import SentenceTransformer
7
  import faiss
8
  import numpy as np
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BlipProcessor, BlipForConditionalGeneration
10
  import torch
11
  from datetime import datetime
12
+ import fitz # PyMuPDF for better PDF image extraction
13
+ import io
14
 
15
# Load models
print("Loading models...")
# Sentence embedder shared by document-chunk retrieval and caption matching.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("Loading LLM...")
# Tokenizer for the TinyLlama chat model that generates the final answers.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
21
  llm_model = AutoModelForCausalLM.from_pretrained(
22
  "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
 
24
  device_map="auto"
25
  )
26
 
27
print("Loading image caption model...")
# BLIP captioner: turns extracted/uploaded images into text so they can be
# matched against questions with the same sentence embedder.
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16
).to("cuda" if torch.cuda.is_available() else "cpu")

print("✅ All models loaded!")

# Store documents and images
# Module-level state, rebuilt on each call to process_files().
documents = []  # text chunks: {'text', 'source'[, 'page']}
images = []  # image records: {'path', 'page', 'source'}
image_captions = []  # one generated caption per entry in `images` (kept in lockstep)
embeddings_index = None  # FAISS index over `documents` embeddings, built by process_files()
41
 
42
def generate_image_caption(image_path):
    """Return a short BLIP-generated caption for the image at *image_path*.

    Any failure (unreadable file, model error) is logged and replaced by a
    generic fallback caption, so callers never need to handle exceptions.
    """
    try:
        picture = Image.open(image_path).convert('RGB')
        model_inputs = caption_processor(picture, return_tensors="pt").to(caption_model.device)
        generated_ids = caption_model.generate(**model_inputs, max_length=50)
        return caption_processor.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Caption error: {e}")
        return "Image from document"
53
+
54
def extract_images_from_pdf(pdf_path):
    """Extract every embedded image from a PDF using PyMuPDF (fitz).

    Each image is written to a fresh temporary directory, so files from
    different PDFs (or repeated runs) can never overwrite each other — the
    previous fixed ``/tmp/pdf_img_p{page}_{idx}.png`` names collided and were
    not portable.  Extraction problems are logged and a partial (possibly
    empty) list is returned rather than raising.

    Returns:
        List of dicts: {'path': str, 'page': int, 'source': str}.
    """
    import tempfile  # local import keeps this fix self-contained

    extracted_images = []
    out_dir = Path(tempfile.mkdtemp(prefix="docvision_pdf_img_"))
    source_name = Path(pdf_path).name
    stem = Path(pdf_path).stem
    try:
        doc = fitz.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                for img_index, img in enumerate(page.get_images(full=True)):
                    xref = img[0]  # first tuple entry is the image xref
                    base_image = doc.extract_image(xref)
                    # Honour the real encoding instead of labelling every
                    # image as .png regardless of its actual format.
                    img_ext = base_image.get("ext", "png")
                    img_path = out_dir / f"{stem}_p{page_num + 1}_{img_index}.{img_ext}"
                    img_path.write_bytes(base_image["image"])

                    extracted_images.append({
                        'path': str(img_path),
                        'page': page_num + 1,
                        'source': source_name,
                    })
        finally:
            doc.close()  # release the handle even if extraction fails mid-loop
    except Exception as e:
        print(f"PDF image extraction error: {e}")

    return extracted_images
84
+
85
def extract_pdf_text(pdf_path):
    """Read a PDF and return one {'text', 'page', 'source'} dict per non-empty page."""
    source_name = Path(pdf_path).name
    page_entries = []
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_number, page in enumerate(reader.pages, start=1):
            content = page.extract_text()
            if content.strip():
                page_entries.append({
                    'text': content,
                    'page': page_number,
                    'source': source_name,
                })
    return page_entries
99
 
100
def extract_docx_text(docx_path):
    """Return the whole DOCX body as a single {'text', 'source'} entry.

    Empty or whitespace-only paragraphs are dropped; the rest are joined
    with newlines into one document record.
    """
    paragraphs = docx.Document(docx_path).paragraphs
    kept = [para.text for para in paragraphs if para.text.strip()]
    return [{'text': '\n'.join(kept), 'source': Path(docx_path).name}]
105
 
106
def extract_txt_text(txt_path):
    """Load a UTF-8 text file as a single {'text', 'source'} entry."""
    content = Path(txt_path).read_text(encoding='utf-8')
    return [{'text': content, 'source': Path(txt_path).name}]
111
 
112
def chunk_text(text, size=400):
    """Split *text* into chunks of at most *size* whitespace-separated words."""
    words = text.split()
    return [
        ' '.join(words[start:start + size])
        for start in range(0, len(words), size)
    ]
119
 
120
def process_files(files, progress=gr.Progress()):
    """Ingest uploaded files and (re)build the retrieval state.

    Text is extracted from PDF/DOCX/TXT files, split into word-bounded
    chunks, and indexed in FAISS; images (standalone uploads or ones
    embedded in PDFs) are captioned for later similarity search.  Files
    with unrecognised extensions are silently skipped.

    Args:
        files: list of Gradio file objects (each exposes a ``.name`` path).
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Human-readable status string.
    """
    global documents, images, image_captions, embeddings_index

    if not files:
        return "Please upload files first"

    documents = []
    images = []
    image_captions = []
    # Drop any stale index from a previous upload so an upload with no text
    # can never be searched against old embeddings.
    embeddings_index = None

    def add_chunks(chunks):
        # Split each extracted block into word-bounded pieces and store them.
        # 'page' defaults to '' so every record shares the same schema.
        for chunk in chunks:
            for small_chunk in chunk_text(chunk['text']):
                documents.append({
                    'text': small_chunk,
                    'source': chunk['source'],
                    'page': chunk.get('page', '')
                })

    def add_image(path, source, page):
        # Register an image and its generated caption (lists stay in lockstep).
        images.append({'path': path, 'source': source, 'page': page})
        image_captions.append(generate_image_caption(path))

    total = len(files)
    for idx, file in enumerate(files):
        progress((idx + 1) / total, desc=f"Processing {Path(file.name).name}...")
        ext = Path(file.name).suffix.lower()

        if ext == '.pdf':
            add_chunks(extract_pdf_text(file.name))
            for img in extract_images_from_pdf(file.name):
                add_image(img['path'], img['source'], img['page'])
        elif ext == '.docx':
            add_chunks(extract_docx_text(file.name))
        elif ext == '.txt':
            add_chunks(extract_txt_text(file.name))
        elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
            add_image(file.name, Path(file.name).name, '')

    # Create embeddings for text
    progress(0.9, desc="Creating embeddings...")
    if documents:
        texts = [doc['text'] for doc in documents]
        embeddings = embedding_model.encode(texts, show_progress_bar=False)

        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings.astype('float32'))
        embeddings_index = index

    progress(1.0, desc="Done!")

    return f"✅ Processed {len(documents)} text chunks and {len(images)} images"
198
 
199
  def search_documents(query, k=3):
200
+ """Search relevant documents"""
201
  if not documents or embeddings_index is None:
202
  return []
203
 
 
211
 
212
  return results
213
 
214
def find_relevant_images(query, top_k=2):
    """Rank stored images against *query* by similarity of their captions.

    Uses cosine similarity between the query embedding and each caption
    embedding — a raw dot product on unnormalized embeddings would bias the
    ranking toward captions with larger vector norms.

    Args:
        query: the user's question text.
        top_k: maximum number of images to return.

    Returns:
        Tuple ``(paths, explanations)``: image file paths and matching
        Markdown blurbs (source, optional page, caption), best match first.
    """
    if not images or not image_captions:
        return [], []

    # Encode query and captions with the shared sentence embedder.
    query_embedding = embedding_model.encode([query])
    caption_embeddings = embedding_model.encode(image_captions)

    # Cosine similarity: normalize rows (epsilon guards against zero vectors).
    q = query_embedding / (np.linalg.norm(query_embedding, axis=1, keepdims=True) + 1e-12)
    c = caption_embeddings / (np.linalg.norm(caption_embeddings, axis=1, keepdims=True) + 1e-12)
    similarities = (c @ q.T).flatten()

    # Indices of the best-matching captions, highest similarity first.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    relevant_images = []
    explanations = []

    for idx in top_indices:
        if idx >= len(images):
            continue  # defensive: captions and images should stay in lockstep

        img_info = images[idx]
        caption = image_captions[idx]

        relevant_images.append(img_info['path'])

        # Create explanation
        explanation = f"**Image from {img_info['source']}"
        if img_info.get('page'):
            explanation += f" (Page {img_info['page']})"
        explanation += f"**\n{caption}"
        explanations.append(explanation)

    return relevant_images, explanations
247
+
248
  def generate_answer(question, context_docs):
249
+ """Generate answer using LLM"""
250
  context = '\n\n'.join([doc['text'] for doc in context_docs])
251
 
252
+ prompt = f"""Answer based on context:
253
 
254
  {context}
255
 
256
  Question: {question}
257
  Answer:"""
258
 
259
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1200)
260
 
261
  with torch.no_grad():
262
  outputs = llm_model.generate(
263
  inputs.input_ids,
264
+ max_new_tokens=200,
265
  temperature=0.7,
266
+ do_sample=True,
267
  pad_token_id=tokenizer.eos_token_id
268
  )
269
 
 
272
 
273
  return answer
274
 
275
+ def answer_query(question, progress=gr.Progress()):
276
+ """Answer query with relevant images"""
277
  if not question:
278
+ return "Please enter a question", None, ""
279
 
280
  if not documents:
281
+ return "Please upload documents first", None, ""
282
 
283
+ # Search documents
284
+ progress(0.3, desc="Searching documents...")
285
+ relevant_docs = search_documents(question, k=3)
286
 
287
  if not relevant_docs:
288
+ return "No relevant info found", None, ""
289
 
290
  # Generate answer
291
+ progress(0.6, desc="Generating answer...")
292
  answer = generate_answer(question, relevant_docs)
293
 
294
  # Format response
295
+ response = f"## πŸ’‘ Answer:\n{answer}\n\n## πŸ“š Sources:\n"
296
  for i, doc in enumerate(relevant_docs, 1):
297
  source = doc['source']
298
  page = doc.get('page', '')
 
301
  else:
302
  response += f"{i}. {source}\n"
303
 
304
+ # Find relevant images
305
+ progress(0.9, desc="Finding relevant images...")
306
+ relevant_imgs, img_explanations = find_relevant_images(question, top_k=2)
307
 
308
+ # Add image explanations to response
309
+ if img_explanations:
310
+ response += f"\n## πŸ–ΌοΈ Related Images:\n"
311
+ for exp in img_explanations:
312
+ response += f"{exp}\n\n"
313
+
314
+ progress(1.0, desc="Done!")
315
+
316
+ return response, relevant_imgs if relevant_imgs else None, ""
317
 
318
# UI
# Gradio layout: file upload + processing on the left, Q&A on the right,
# with an image gallery and example questions below.
with gr.Blocks(title="DocVision AI", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 📚 DocVision AI - Smart Document Q&A
    Upload documents and ask questions to get AI-powered answers with relevant images
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="📁 Upload Files (PDF, DOCX, TXT, Images)",
                file_count="multiple",
                file_types=[".pdf", ".docx", ".txt", ".jpg", ".png", ".jpeg", ".gif"]
            )
            process_btn = gr.Button("⚡ Process Documents", variant="primary", size="lg")
            status = gr.Textbox(label="Status", lines=2)

        with gr.Column():
            question = gr.Textbox(
                label="❓ Ask a Question",
                placeholder="What would you like to know?",
                lines=3
            )
            ask_btn = gr.Button("🔍 Get Answer", variant="primary", size="lg")

    # Answer markdown and gallery are fed by answer_query below.
    answer = gr.Markdown(label="📝 Answer & Sources")

    with gr.Row():
        gallery = gr.Gallery(
            label="🖼️ Relevant Images with Explanations",
            columns=2,
            height=400
        )

    gr.Markdown("### 📌 Example Questions:")
    gr.Examples(
        examples=[
            ["What is this document about?"],
            ["Summarize the main points"],
            ["What are the key findings?"],
            ["Show me information about diagrams or charts"]
        ],
        inputs=question
    )

    # Hidden textbox: answer_query returns a third value, kept invisible.
    debug_output = gr.Textbox(label="Debug Info", visible=False)

    # Event handlers
    process_btn.click(
        process_files,
        inputs=[file_input],
        outputs=[status]
    )

    ask_btn.click(
        answer_query,
        inputs=[question],
        outputs=[answer, gallery, debug_output]
    )

    # Pressing Enter in the question box triggers the same handler.
    question.submit(
        answer_query,
        inputs=[question],
        outputs=[answer, gallery, debug_output]
    )

if __name__ == "__main__":
    app.launch()