Update app.py
app.py CHANGED
@@ -7,29 +7,34 @@ import gradio as gr
 from gradio import FileData
 import time
 import spaces
-
-import
-
-import tempfile
+import fitz  # PyMuPDF
+import io
+import numpy as np
 
 ckpt = "Daemontatox/DocumentCogito"
 model = MllamaForConditionalGeneration.from_pretrained(ckpt,
                                                        torch_dtype=torch.bfloat16).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
 
-def
-    """Convert PDF
-
-
+def process_pdf_file(file_path):
+    """Convert PDF to images and extract text using PyMuPDF."""
+    doc = fitz.open(file_path)
+    images = []
     text = ""
-
-
+
+    for page in doc:
+        # Extract text
+        text += page.get_text() + "\n"
+
+        # Convert page to image
+        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
+        img_data = pix.tobytes("png")
+        img = Image.open(io.BytesIO(img_data))
+        images.append(img.convert("RGB"))
+
+    doc.close()
     return images, text
 
-def is_pdf(file_path):
-    """Check if the file is a PDF."""
-    return file_path.lower().endswith('.pdf')
-
 @spaces.GPU()
 def bot_streaming(message, history, max_new_tokens=2048):
     txt = message["text"]
@@ -41,7 +46,7 @@ def bot_streaming(message, history, max_new_tokens=2048):
     # Process history
     for i, msg in enumerate(history):
         if isinstance(msg[0], tuple):
-            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "
+            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
             images.append(Image.open(msg[0][0]).convert("RGB"))
         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
@@ -52,11 +57,13 @@ def bot_streaming(message, history, max_new_tokens=2048):
 
     # Process current message
     if len(message["files"]) == 1:
-
+        file_data = message["files"][0]
+        file_path = file_data["path"] if isinstance(file_data, dict) else file_data
 
-        if
-
-
+        # Check if file is PDF
+        if file_path.lower().endswith('.pdf'):
+            # Process PDF
+            pdf_images, pdf_text = process_pdf_file(file_path)
             images.extend(pdf_images)
             txt = f"{txt}\nExtracted text from PDF:\n{pdf_text}"
         else:
@@ -73,11 +80,16 @@ def bot_streaming(message, history, max_new_tokens=2048):
     if not images:
         inputs = processor(text=texts, return_tensors="pt").to("cuda")
     else:
+        # Handle multiple images if needed
+        max_images = 4  # Limit number of images to process
+        if len(images) > max_images:
+            images = images[:max_images]
+            txt += f"\n(Note: Only processing first {max_images} pages of the PDF)"
+
         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
 
     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-    generated_text = ""
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
@@ -85,10 +97,10 @@ def bot_streaming(message, history, max_new_tokens=2048):
 
     for new_text in streamer:
         buffer += new_text
-        generated_text_without_prompt = buffer
         time.sleep(0.01)
         yield buffer
 
+# Create the Gradio interface
 demo = gr.ChatInterface(
     fn=bot_streaming,
     title="Document Analyzer",
@@ -116,7 +128,8 @@ demo = gr.ChatInterface(
     multimodal=True
 )
 
-# Update file types
+# Update accepted file types
 demo.textbox.file_types = ["image", "pdf"]
 
+# Launch the interface
 demo.launch(debug=True)
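The new PDF path can be sanity-checked outside the Space. A minimal sketch, assuming PyMuPDF and Pillow are installed locally; "sample.pdf" is a hypothetical test file, and the helper simply mirrors the process_pdf_file added in this commit:

# Standalone check of the PDF helper introduced above.
import io
import fitz  # PyMuPDF
from PIL import Image

def process_pdf_file(file_path):
    """Convert PDF to images and extract text using PyMuPDF (same logic as the Space)."""
    doc = fitz.open(file_path)
    images = []
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))  # render at 300 DPI
        images.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
    doc.close()
    return images, text

pages, extracted = process_pdf_file("sample.pdf")  # hypothetical local file
print(f"{len(pages)} page image(s), {len(extracted)} characters of extracted text")

Since bot_streaming caps the images passed to the processor at max_images = 4, only the first four rendered pages reach the model; the extracted text, however, covers the whole document.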