Spaces:

Girinath11
/

DocVision_AI

Sleeping

App Files Files Community

Girinath11 committed on Jan 28

Commit

e789621

verified ·

1 Parent(s): 1fbb18c

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -288

app.py CHANGED Viewed

@@ -1,377 +1,210 @@
-"""
-DocVision AI - Multimodal RAG System
-Smart Document & Image Question Answering with Text Extraction
-"""
 import gradio as gr
-import os
 from pathlib import Path
-import json
-import tempfile
 from PIL import Image
 import PyPDF2
 import docx
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import torch
-# Initialize models
 print("Loading models...")
 embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-# Using a free LLM from Hugging Face
-tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
 llm_model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/phi-2",
-    torch_dtype=torch.float32,
-    trust_remote_code=True,
     device_map="auto"
 )
-# Global storage
-document_store = {
-    'texts': [],
-    'images': [],
-    'metadata': [],
-    'embeddings': None,
-    'index': None
-}
-def extract_text_from_pdf(pdf_path):
-    """Extract text from PDF file"""
-    text_chunks = []
-    images = []
-    try:
-        with open(pdf_path, 'rb') as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            for page_num, page in enumerate(pdf_reader.pages):
-                text = page.extract_text()
-                if text.strip():
-                    text_chunks.append({
-                        'content': text,
-                        'page': page_num + 1,
-                        'type': 'text'
-                    })
-    except Exception as e:
-        print(f"Error extracting PDF: {e}")
-    return text_chunks, images
-def extract_text_from_docx(docx_path):
-    """Extract text from DOCX file"""
-    text_chunks = []
-    try:
-        doc = docx.Document(docx_path)
-        full_text = []
-        for para in doc.paragraphs:
-            if para.text.strip():
-                full_text.append(para.text)
-        text_chunks.append({
-            'content': '\n'.join(full_text),
-            'type': 'text'
-        })
-    except Exception as e:
-        print(f"Error extracting DOCX: {e}")
-    return text_chunks
-def extract_text_from_txt(txt_path):
-    """Extract text from TXT file"""
-    try:
-        with open(txt_path, 'r', encoding='utf-8') as file:
-            content = file.read()
-            return [{
-                'content': content,
-                'type': 'text'
-            }]
-    except Exception as e:
-        print(f"Error extracting TXT: {e}")
-        return []
-def process_image(image_path):
-    """Process and store image"""
-    try:
-        img = Image.open(image_path)
-        return {
-            'path': image_path,
-            'type': 'image'
-        }
-    except Exception as e:
-        print(f"Error processing image: {e}")
-        return None
-def chunk_text(text, chunk_size=500):
-    """Split text into smaller chunks"""
     words = text.split()
     chunks = []
-    for i in range(0, len(words), chunk_size):
-        chunk = ' '.join(words[i:i + chunk_size])
-        chunks.append(chunk)
     return chunks
-def process_documents(files):
-    """Process uploaded documents"""
-    global document_store
     if not files:
-        return "No files uploaded!"
-    # Reset document store
-    document_store = {
-        'texts': [],
-        'images': [],
-        'metadata': [],
-        'embeddings': None,
-        'index': None
-    }
-    total_texts = 0
-    total_images = 0
     for file in files:
-        file_path = file.name
-        file_ext = Path(file_path).suffix.lower()
-        if file_ext == '.pdf':
-            text_chunks, images = extract_text_from_pdf(file_path)
-            for chunk in text_chunks:
-                # Split into smaller chunks
-                small_chunks = chunk_text(chunk['content'])
-                for sc in small_chunks:
-                    document_store['texts'].append(sc)
-                    document_store['metadata'].append({
-                        'source': Path(file_path).name,
-                        'page': chunk.get('page', 'N/A'),
-                        'type': 'text'
-                    })
-                    total_texts += 1
-        elif file_ext == '.docx':
-            text_chunks = extract_text_from_docx(file_path)
-            for chunk in text_chunks:
-                small_chunks = chunk_text(chunk['content'])
-                for sc in small_chunks:
-                    document_store['texts'].append(sc)
-                    document_store['metadata'].append({
-                        'source': Path(file_path).name,
-                        'type': 'text'
                     })
-                    total_texts += 1
-        elif file_ext == '.txt':
-            text_chunks = extract_text_from_txt(file_path)
-            for chunk in text_chunks:
-                small_chunks = chunk_text(chunk['content'])
-                for sc in small_chunks:
-                    document_store['texts'].append(sc)
-                    document_store['metadata'].append({
-                        'source': Path(file_path).name,
-                        'type': 'text'
-                    })
-                    total_texts += 1
-        elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp']:
-            img_data = process_image(file_path)
-            if img_data:
-                document_store['images'].append(img_data)
-                total_images += 1
-    # Create embeddings and index
-    if document_store['texts']:
-        embeddings = embedding_model.encode(document_store['texts'])
-        document_store['embeddings'] = embeddings
-        # Create FAISS index
-        dimension = embeddings.shape[1]
-        index = faiss.IndexFlatL2(dimension)
         index.add(embeddings.astype('float32'))
-        document_store['index'] = index
-    return f"✅ Documents processed successfully!\n📄 Text chunks: {total_texts}\n🖼️ Images: {total_images}"
-def retrieve_relevant_context(query, top_k=3):
-    """Retrieve relevant text chunks for the query"""
-    if not document_store['texts'] or document_store['index'] is None:
         return []
-    query_embedding = embedding_model.encode([query])
-    distances, indices = document_store['index'].search(query_embedding.astype('float32'), top_k)
-    relevant_chunks = []
     for idx in indices[0]:
-        if idx < len(document_store['texts']):
-            relevant_chunks.append({
-                'text': document_store['texts'][idx],
-                'metadata': document_store['metadata'][idx]
-            })
-    return relevant_chunks
-def generate_answer(query, context_chunks):
-    """Generate answer using LLM"""
-    # Prepare context
-    context = "\n\n".join([chunk['text'] for chunk in context_chunks])
-    # Create prompt
-    prompt = f"""Based on the following context, answer the question accurately and concisely.
-Context:
 {context}
-Question: {query}
 Answer:"""
-    # Generate response
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
     with torch.no_grad():
         outputs = llm_model.generate(
             inputs.input_ids,
-            max_new_tokens=300,
             temperature=0.7,
-            do_sample=True,
             top_p=0.9,
             pad_token_id=tokenizer.eos_token_id
         )
     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract only the answer part
-    if "Answer:" in answer:
-        answer = answer.split("Answer:")[-1].strip()
     return answer
-def find_relevant_images(query):
-    """Find relevant images based on query keywords"""
-    if not document_store['images']:
-        return []
-    # Simple keyword matching for images
-    # You can enhance this with image captioning models
-    return document_store['images'][:2]  # Return first 2 images for now
-def answer_question(query):
-    """Main function to answer questions"""
-    if not query.strip():
-        return "Please enter a question!", None
-    if not document_store['texts']:
-        return "Please upload documents first!", None
-    # Retrieve relevant context
-    relevant_chunks = retrieve_relevant_context(query, top_k=3)
-    if not relevant_chunks:
-        return "No relevant information found in the documents.", None
     # Generate answer
-    answer = generate_answer(query, relevant_chunks)
-    # Find relevant images
-    relevant_images = find_relevant_images(query)
-    # Prepare response
-    response = f"**Answer:**\n{answer}\n\n"
-    response += f"\n**Sources:**\n"
-    for i, chunk in enumerate(relevant_chunks, 1):
-        source = chunk['metadata'].get('source', 'Unknown')
-        page = chunk['metadata'].get('page', '')
         if page:
             response += f"{i}. {source} (Page {page})\n"
         else:
             response += f"{i}. {source}\n"
     # Return images if available
-    image_outputs = None
-    if relevant_images:
-        image_outputs = [img['path'] for img in relevant_images]
-    return response, image_outputs
-# Create Gradio interface
-with gr.Blocks(title="📚 DocVision AI - Multimodal RAG", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 📚 DocVision AI
-    ### *Smart Document & Image Question Answering with Multimodal RAG*
-    Extract text from documents, upload images, and ask intelligent questions!
-    **How to use:**
-    1. 📤 **Upload** your documents (PDF, DOCX, TXT) and images (JPG, PNG)
-    2. ⚡ **Process** to extract and index content
-    3. 💬 **Ask** questions and get accurate answers with relevant images!
-    """)
     with gr.Row():
-        with gr.Column(scale=1):
-            file_upload = gr.File(
-                label="📁 Upload Documents & Images",
                 file_count="multiple",
-                file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png"]
-            )
-            process_btn = gr.Button("⚡ Process Documents", variant="primary", size="lg")
-            status_output = gr.Textbox(label="📊 Processing Status", lines=3)
-        with gr.Column(scale=1):
-            gr.Markdown("### 💬 Ask Your Questions")
-            question_input = gr.Textbox(
-                label="Your Question",
-                placeholder="What would you like to know about your documents?",
-                lines=3
             )
-            ask_btn = gr.Button("🔍 Get Answer", variant="primary", size="lg")
-    with gr.Row():
-        answer_output = gr.Markdown(label="📝 Answer & Sources")
-    with gr.Row():
-        image_output = gr.Gallery(
-            label="🖼️ Relevant Images from Documents",
-            columns=2,
-            height="auto"
-        )
-    # Example questions
-    gr.Markdown("### 📌 Try These Example Questions:")
     gr.Examples(
         examples=[
-            ["What is the main topic of this document?"],
-            ["Summarize the key points mentioned"],
-            ["What are the important dates or numbers mentioned?"],
-            ["List the main findings or conclusions"],
         ],
-        inputs=question_input
     )
-    gr.Markdown("""
-    ---
-    **Powered by:** 🤗 Hugging Face | Microsoft Phi-2 | Sentence Transformers | FAISS
-    """)
-    # Event handlers
-    process_btn.click(
-        fn=process_documents,
-        inputs=[file_upload],
-        outputs=[status_output]
-    )
-    ask_btn.click(
-        fn=answer_question,
-        inputs=[question_input],
-        outputs=[answer_output, image_output]
-    )
-    question_input.submit(
-        fn=answer_question,
-        inputs=[question_input],
-        outputs=[answer_output, image_output]
-    )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from pathlib import Path
 from PIL import Image
 import PyPDF2
 import docx
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+from datetime import datetime
# Load models once at import time; every request handler below shares them.
print("Loading models...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

LLM_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    # Half precision is only worthwhile on a GPU. With device_map="auto" the
    # model may end up on CPU, where float16 is slow and (on older PyTorch
    # builds) not implemented for every op, so fall back to float32 there.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

# In-memory state for the current upload session; rebuilt by process_files().
documents = []           # chunk dicts: {'text', 'source'} plus 'page' for PDFs
images = []              # paths of uploaded image files
embeddings_index = None  # FAISS index over the chunk embeddings, or None
def extract_pdf_text(pdf_path):
    """Return one record per PDF page that yields non-blank text.

    Each record is a dict with the page's extracted 'text', its 1-based
    'page' number and 'source' (the file name component of ``pdf_path``).
    Pages whose extracted text is empty or whitespace-only are skipped.
    """
    records = []
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_number, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text.strip():
                continue
            records.append({
                'text': page_text,
                'page': page_number,
                'source': Path(pdf_path).name,
            })
    return records
def extract_docx_text(docx_path):
    """Return the text of a DOCX file as a single-record list.

    Paragraphs whose text is blank are dropped; the remaining ones are joined
    with newlines. The record carries 'text' and 'source' (the file name
    component of ``docx_path``).
    """
    document = docx.Document(docx_path)
    kept = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return [{'text': '\n'.join(kept), 'source': Path(docx_path).name}]
def extract_txt_text(txt_path):
    """Return the contents of a plain-text file as a single-record list.

    The file is decoded as UTF-8. A leading byte-order mark is stripped
    ('utf-8-sig'), and bytes that are not valid UTF-8 are replaced with
    U+FFFD instead of raising UnicodeDecodeError, so an upload saved in
    another encoding is still indexed rather than failing the request.

    Args:
        txt_path: path of the text file to read.

    Returns:
        A one-element list holding a dict with 'text' (the decoded contents)
        and 'source' (the file name component of ``txt_path``).
    """
    with open(txt_path, 'r', encoding='utf-8-sig', errors='replace') as f:
        text = f.read()
    return [{'text': text, 'source': Path(txt_path).name}]
def chunk_text(text, size=400):
    """Split text into consecutive chunks of at most ``size`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk joins
    its words with single spaces, so the original whitespace is not kept.

    Args:
        text: the text to split.
        size: maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: if ``size`` is smaller than 1. A negative step would
            otherwise make ``range`` yield nothing and silently drop all text.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    words = text.split()
    return [' '.join(words[start:start + size])
            for start in range(0, len(words), size)]
def process_files(files):
    """Index the uploaded files so questions can be answered from them.

    Text is extracted from .pdf, .docx and .txt files, split with
    chunk_text() and stored in the module-level ``documents`` list; the paths
    of .jpg/.jpeg/.png files are stored in ``images``. When any text was
    found, a FAISS L2 index over the chunk embeddings is stored in
    ``embeddings_index``. Previously indexed content is always discarded.
    Files with any other extension are ignored.

    Args:
        files: the uploaded files, either path strings / Path objects or
            objects exposing the path as ``.name``. May be empty or None.

    Returns:
        A status string for the UI. If some files could not be read, their
        names are listed on an extra line.
    """
    global documents, images, embeddings_index
    if not files:
        return "Please upload files first"

    documents = []
    images = []
    # Drop the old index as well: an upload that contains no text would
    # otherwise leave behind an index that no longer matches `documents`.
    embeddings_index = None
    failed = []

    for file in files:
        # Accept plain paths as well as upload objects that carry `.name`.
        path = file if isinstance(file, (str, Path)) else file.name
        ext = Path(path).suffix.lower()

        if ext in ('.jpg', '.jpeg', '.png'):
            images.append(path)
            continue

        if ext == '.pdf':
            extract = extract_pdf_text
        elif ext == '.docx':
            extract = extract_docx_text
        elif ext == '.txt':
            extract = extract_txt_text
        else:
            continue

        try:
            chunks = extract(path)
        except Exception as exc:
            # Handler boundary: one unreadable or corrupt file must not
            # abort the whole batch. Log it and report it in the status.
            print(f"Error reading {Path(path).name}: {exc}")
            failed.append(Path(path).name)
            continue

        for chunk in chunks:
            for small_chunk in chunk_text(chunk['text']):
                entry = {'text': small_chunk, 'source': chunk['source']}
                if 'page' in chunk:
                    entry['page'] = chunk['page']
                documents.append(entry)

    # Create embeddings
    if documents:
        texts = [doc['text'] for doc in documents]
        embeddings = embedding_model.encode(texts)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings.astype('float32'))
        embeddings_index = index

    status = f"Processed {len(documents)} text chunks and {len(images)} images"
    if failed:
        status += f"\nCould not read: {', '.join(failed)}"
    return status
def search_documents(query, k=3):
    """Return up to ``k`` indexed chunks closest to ``query``.

    The query is embedded with ``embedding_model`` and looked up in
    ``embeddings_index``; results come back in the order the index returns
    them.

    Args:
        query: the question text to search for.
        k: maximum number of chunks to return.

    Returns:
        A list of chunk dicts taken from ``documents``; empty when nothing
        has been indexed yet.
    """
    if not documents or embeddings_index is None:
        return []
    query_vec = embedding_model.encode([query])
    _, indices = embeddings_index.search(query_vec.astype('float32'), k)
    # FAISS pads the result with -1 when the index holds fewer than k
    # vectors. Checking only `idx < len(documents)` would let -1 through and
    # return documents[-1] as a bogus hit, so the lower bound matters.
    return [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]
def generate_answer(question, context_docs):
    """Generate an answer to ``question`` from the retrieved chunks.

    Args:
        question: the user's question.
        context_docs: chunk dicts with a 'text' key, used as context.

    Returns:
        The text generated by the LLM after the prompt, stripped of
        surrounding whitespace.
    """
    max_prompt_tokens = 1500
    header = "Answer the question based on this context:\n"
    footer = f"\nQuestion: {question}\nAnswer:"
    context = '\n\n'.join([doc['text'] for doc in context_docs])

    # Truncating the whole prompt cuts from the end by default, which is
    # exactly where the question and the "Answer:" cue sit. Trim only the
    # context so that a long context can never push the question out.
    reserved = len(tokenizer(header + footer).input_ids)
    context_budget = max(max_prompt_tokens - reserved, 1)
    context_ids = tokenizer(
        context,
        add_special_tokens=False,
        truncation=True,
        max_length=context_budget,
    ).input_ids
    context = tokenizer.decode(context_ids, skip_special_tokens=True)

    prompt = f"{header}{context}{footer}"
    # The final truncation is only a safety net (e.g. an extremely long question).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=max_prompt_tokens)
    # The model is loaded with device_map="auto"; put the inputs on its device.
    inputs = inputs.to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=250,
            # temperature/top_p only take effect when sampling is enabled.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Answer:" breaks if that marker also appears in the context or in
    # the generated text.
    prompt_length = inputs["input_ids"].shape[-1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()
def answer_query(question):
    """Answer ``question`` from the indexed documents.

    Args:
        question: the text typed by the user; may be empty or None.

    Returns:
        A ``(markdown, images)`` tuple. ``markdown`` holds the answer followed
        by a numbered source list (with page numbers where known), or a short
        hint when there is nothing to answer. ``images`` is a list with up to
        two uploaded image paths, or None when there are none or no answer
        was produced.
    """
    # A whitespace-only question counts as empty.
    if not question or not question.strip():
        return "Please enter a question", None
    if not documents:
        return "Please upload documents first", None

    # Search relevant docs
    relevant_docs = search_documents(question)
    if not relevant_docs:
        return "No relevant info found", None

    # Generate answer
    answer = generate_answer(question, relevant_docs)

    # Format response
    source_lines = []
    for i, doc in enumerate(relevant_docs, 1):
        page = doc.get('page', '')
        suffix = f" (Page {page})" if page else ""
        source_lines.append(f"{i}. {doc['source']}{suffix}\n")
    response = f"**Answer:**\n{answer}\n\n**Sources:**\n" + ''.join(source_lines)

    # Return images if available
    imgs = images[:2] if images else None
    return response, imgs
# UI: upload and processing controls on the left, question box on the right,
# answer and image gallery underneath.
with gr.Blocks(title="DocVision AI") as app:
    gr.Markdown("# DocVision AI - Document Q&A System")
    gr.Markdown("Upload documents and ask questions to get AI-powered answers")
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Files (PDF, DOCX, TXT, Images)",
                file_count="multiple",
                # Keep in sync with the extensions process_files() handles;
                # ".jpeg" was missing although the handler accepts it.
                file_types=[".pdf", ".docx", ".txt", ".jpg", ".jpeg", ".png"]
            )
            process_btn = gr.Button("Process Documents", variant="primary")
            status = gr.Textbox(label="Status")
        with gr.Column():
            question = gr.Textbox(label="Ask a Question", lines=2)
            ask_btn = gr.Button("Get Answer", variant="primary")
    answer = gr.Markdown(label="Answer")
    gallery = gr.Gallery(label="Related Images", columns=2)
    gr.Examples(
        examples=[
            ["What is this document about?"],
            ["Summarize the main points"],
            ["What are the key findings?"]
        ],
        inputs=question
    )
    # Event wiring: the button and pressing Enter in the textbox both ask.
    process_btn.click(process_files, inputs=[file_input], outputs=[status])
    ask_btn.click(answer_query, inputs=[question], outputs=[answer, gallery])
    question.submit(answer_query, inputs=[question], outputs=[answer, gallery])

if __name__ == "__main__":
    app.launch()