Create app.py

app.py
ADDED
@@ -0,0 +1,377 @@
"""
DocVision AI - Multimodal RAG System
Smart Document & Image Question Answering with Text Extraction
"""

import gradio as gr
from pathlib import Path

from PIL import Image
import PyPDF2
import docx
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Initialize models
print("Loading models...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Using a free LLM from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
llm_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    device_map="auto"
)

# Global storage for indexed content
document_store = {
    'texts': [],
    'images': [],
    'metadata': [],
    'embeddings': None,
    'index': None
}

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file. PyPDF2 extracts text only; the images
    list is a placeholder kept for the return signature."""
    text_chunks = []
    images = []

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                text = page.extract_text() or ""  # extract_text() can return None
                if text.strip():
                    text_chunks.append({
                        'content': text,
                        'page': page_num + 1,
                        'type': 'text'
                    })
    except Exception as e:
        print(f"Error extracting PDF: {e}")

    return text_chunks, images

def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file"""
    text_chunks = []
    try:
        doc = docx.Document(docx_path)
        full_text = [para.text for para in doc.paragraphs if para.text.strip()]
        if full_text:  # skip documents with no extractable text
            text_chunks.append({
                'content': '\n'.join(full_text),
                'type': 'text'
            })
    except Exception as e:
        print(f"Error extracting DOCX: {e}")

    return text_chunks

def extract_text_from_txt(txt_path):
    """Extract text from a plain-text file"""
    try:
        # errors='replace' keeps non-UTF-8 files from crashing the upload
        with open(txt_path, 'r', encoding='utf-8', errors='replace') as file:
            content = file.read()
        return [{
            'content': content,
            'type': 'text'
        }]
    except Exception as e:
        print(f"Error extracting TXT: {e}")
        return []

def process_image(image_path):
    """Validate an image file and register it for later display"""
    try:
        with Image.open(image_path) as img:
            img.verify()  # raises if the file is not a readable image
        return {
            'path': image_path,
            'type': 'image'
        }
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

def chunk_text(text, chunk_size=500):
    """Split text into chunks of at most chunk_size words"""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def process_documents(files):
    """Process uploaded documents and build the retrieval index"""
    global document_store

    if not files:
        return "No files uploaded!"

    # Reset document store
    document_store = {
        'texts': [],
        'images': [],
        'metadata': [],
        'embeddings': None,
        'index': None
    }

    total_texts = 0
    total_images = 0

    for file in files:
        # gr.File may yield tempfile objects or plain path strings,
        # depending on the Gradio version
        file_path = file.name if hasattr(file, 'name') else file
        file_ext = Path(file_path).suffix.lower()

        text_chunks = []
        if file_ext == '.pdf':
            text_chunks, _ = extract_text_from_pdf(file_path)
        elif file_ext == '.docx':
            text_chunks = extract_text_from_docx(file_path)
        elif file_ext == '.txt':
            text_chunks = extract_text_from_txt(file_path)
        elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
            img_data = process_image(file_path)
            if img_data:
                document_store['images'].append(img_data)
                total_images += 1

        # Split each extracted block into smaller retrieval units
        for chunk in text_chunks:
            for sc in chunk_text(chunk['content']):
                document_store['texts'].append(sc)
                metadata = {'source': Path(file_path).name, 'type': 'text'}
                if 'page' in chunk:
                    metadata['page'] = chunk['page']
                document_store['metadata'].append(metadata)
                total_texts += 1

    # Create embeddings and FAISS index
    if document_store['texts']:
        embeddings = embedding_model.encode(document_store['texts'])
        document_store['embeddings'] = embeddings

        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings.astype('float32'))
        document_store['index'] = index

    return f"✅ Documents processed successfully!\n📄 Text chunks: {total_texts}\n🖼️ Images: {total_images}"

def retrieve_relevant_context(query, top_k=3):
    """Retrieve the text chunks most relevant to the query"""
    if not document_store['texts'] or document_store['index'] is None:
        return []

    query_embedding = embedding_model.encode([query])
    distances, indices = document_store['index'].search(query_embedding.astype('float32'), top_k)

    relevant_chunks = []
    for idx in indices[0]:
        # FAISS pads with -1 when fewer than top_k results exist
        if 0 <= idx < len(document_store['texts']):
            relevant_chunks.append({
                'text': document_store['texts'][idx],
                'metadata': document_store['metadata'][idx]
            })

    return relevant_chunks

def generate_answer(query, context_chunks):
    """Generate an answer with the LLM, grounded in the retrieved context"""
    context = "\n\n".join(chunk['text'] for chunk in context_chunks)

    prompt = f"""Based on the following context, answer the question accurately and concisely.

Context:
{context}

Question: {query}

Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
    # Move tensors to the model's device (device_map="auto" may place it on GPU)
    inputs = {k: v.to(llm_model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,  # passes attention_mask along with input_ids
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the text after the final "Answer:" marker
    if "Answer:" in answer:
        answer = answer.split("Answer:")[-1].strip()

    return answer

def find_relevant_images(query):
    """Return images to show alongside the answer.

    Placeholder ranking: returns the first two uploaded images. This could
    be upgraded to true relevance matching with an image-captioning or
    CLIP-style embedding model.
    """
    if not document_store['images']:
        return []
    return document_store['images'][:2]

def answer_question(query):
    """Main entry point: retrieve context, generate an answer, collect sources"""
    if not query.strip():
        return "Please enter a question!", None

    if not document_store['texts']:
        return "Please upload and process documents first!", None

    # Retrieve relevant context
    relevant_chunks = retrieve_relevant_context(query, top_k=3)
    if not relevant_chunks:
        return "No relevant information found in the documents.", None

    # Generate answer and gather any images to display
    answer = generate_answer(query, relevant_chunks)
    relevant_images = find_relevant_images(query)

    # Prepare response with source attributions
    response = f"**Answer:**\n{answer}\n\n"
    response += "\n**Sources:**\n"
    for i, chunk in enumerate(relevant_chunks, 1):
        source = chunk['metadata'].get('source', 'Unknown')
        page = chunk['metadata'].get('page', '')
        if page:
            response += f"{i}. {source} (Page {page})\n"
        else:
            response += f"{i}. {source}\n"

    image_outputs = [img['path'] for img in relevant_images] if relevant_images else None

    return response, image_outputs

# Create Gradio interface
with gr.Blocks(title="📄 DocVision AI - Multimodal RAG", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📄 DocVision AI
    ### *Smart Document & Image Question Answering with Multimodal RAG*

    Extract text from documents, upload images, and ask intelligent questions!

    **How to use:**
    1. 📤 **Upload** your documents (PDF, DOCX, TXT) and images (JPG, PNG)
    2. ⚡ **Process** to extract and index content
    3. 💬 **Ask** questions and get accurate answers with relevant images!
    """)

    with gr.Row():
        with gr.Column(scale=1):
            file_upload = gr.File(
                label="📁 Upload Documents & Images",
                file_count="multiple",
                file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png"]
            )
            process_btn = gr.Button("⚡ Process Documents", variant="primary", size="lg")
            status_output = gr.Textbox(label="📊 Processing Status", lines=3)

        with gr.Column(scale=1):
            gr.Markdown("### 💬 Ask Your Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about your documents?",
                lines=3
            )
            ask_btn = gr.Button("🔍 Get Answer", variant="primary", size="lg")

    with gr.Row():
        answer_output = gr.Markdown(label="📝 Answer & Sources")

    with gr.Row():
        image_output = gr.Gallery(
            label="🖼️ Relevant Images from Documents",
            columns=2,
            height="auto"
        )

    # Example questions
    gr.Markdown("### 💡 Try These Example Questions:")
    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["Summarize the key points mentioned"],
            ["What are the important dates or numbers mentioned?"],
            ["List the main findings or conclusions"],
        ],
        inputs=question_input
    )

    gr.Markdown("""
    ---
    **Powered by:** 🤗 Hugging Face | Microsoft Phi-2 | Sentence Transformers | FAISS
    """)

    # Event handlers
    process_btn.click(
        fn=process_documents,
        inputs=[file_upload],
        outputs=[status_output]
    )

    ask_btn.click(
        fn=answer_question,
        inputs=[question_input],
        outputs=[answer_output, image_output]
    )

    question_input.submit(
        fn=answer_question,
        inputs=[question_input],
        outputs=[answer_output, image_output]
    )

if __name__ == "__main__":
    demo.launch()
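
Not part of this commit, but for app.py to run as a Space it will also need a requirements.txt alongside it. A minimal sketch covering the imports above: faiss-cpu is chosen on the assumption of CPU hardware, accelerate is included because device_map="auto" depends on it, and einops because some phi-2 trust_remote_code revisions import it; no version pins are implied.

    # requirements.txt (sketch)
    gradio
    transformers
    torch
    accelerate             # required for device_map="auto"
    sentence-transformers
    faiss-cpu              # swap for faiss-gpu on GPU hardware
    PyPDF2
    python-docx
    Pillow
    einops                 # used by some phi-2 remote-code revisions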