pranshh committed
Commit 28c861d · verified · 1 Parent(s): dd3183f

Uploaded app.py and requirements.txt

Files changed (2)
  1. app.py.py +157 -0
  2. requirements.txt +8 -0
app.py.py ADDED
@@ -0,0 +1,157 @@
+ # -*- coding: utf-8 -*-
+ """OCR Web Application Prototype.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1vzsQ17-W1Vy6yJ60XUwFy0QRkOR_SIg7
+ """
+
+ import gradio as gr
+ from transformers import AutoModel, AutoTokenizer
+ from PIL import Image
+ import os
+ import re
+
+ # Pin the checkpoint to a specific revision for reproducibility
+ revision = "5364fe1ab774ef13c2c79023dc91d8c1e7cfdce4"
+
+ # Load tokenizer and model (GOT ships custom modeling code, hence trust_remote_code)
+ tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, revision=revision)
+ model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, low_cpu_mem_usage=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id, revision=revision)
+ model = model.eval()
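+
+ # Note: GOT's custom `chat` helper (loaded via trust_remote_code) takes an image
+ # file path and an `ocr_type`; a minimal sketch, assuming the upstream GOT-OCR2.0
+ # API that this CPU checkpoint mirrors (hypothetical file 'page.png'):
+ #   plain = model.chat(tokenizer, "page.png", ocr_type='ocr')     # plain-text OCR
+ #   fancy = model.chat(tokenizer, "page.png", ocr_type='format')  # formatted output (used below)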
+
+ # Function to perform OCR and optional keyword search
+ def process_image_with_search(image, keyword):
+     if image is None:
+         return "Please upload an image first."
+     temp_img_path = "temp_image.png"
+     try:
+         # Save the PIL image to a temporary file (the model expects a file path)
+         image.save(temp_img_path)
+
+         # Perform OCR with the model using the file path
+         extracted_text = model.chat(tokenizer, temp_img_path, ocr_type='format')
+
+         # Convert extracted text to string if it's not already
+         extracted_text = extracted_text if isinstance(extracted_text, str) else str(extracted_text)
+
+         # If a keyword is provided, search for it (case-insensitive)
+         if keyword:
+             match = re.search(re.escape(keyword), extracted_text, flags=re.IGNORECASE)
+             if match:
+                 # Highlight the first occurrence, preserving its original casing
+                 highlighted_text = (
+                     extracted_text[:match.start()]
+                     + f"**{match.group(0)}**"
+                     + extracted_text[match.end():]
+                 )
+                 result = f"Keyword '{keyword}' found:\n\n{highlighted_text}"
+             else:
+                 result = f"Keyword '{keyword}' not found in the extracted text.\n\nExtracted Text:\n{extracted_text}"
+         else:
+             # No keyword was given: return the extracted text without searching
+             result = f"Extracted Text:\n\n{extracted_text}"
+
+         return result
+     except Exception as e:
+         return str(e)  # Return the error message in case of failure
+     finally:
+         # Delete the temporary file even if OCR fails
+         if os.path.exists(temp_img_path):
+             os.remove(temp_img_path)
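+
+ # Quick smoke test without the UI (hypothetical file 'sample.png'):
+ #   print(process_image_with_search(Image.open("sample.png"), "invoice"))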
+
+ # Define Gradio interface
+ iface = gr.Interface(
+     fn=process_image_with_search,  # The function that runs OCR and the keyword search
+     inputs=[gr.Image(type='pil'), gr.Textbox(label="Enter keyword to search (optional)")],  # Image input + keyword input
+     outputs='text',  # Plain-text output with the search result
+     title="OCR with GOT and Keyword Search",
+     description="Upload an image to get OCR results. You can also search for a keyword in the extracted text."
+ )
+
+ # Launch the interface
+ iface.launch(debug=True)
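+
+ # Deployment note: on Hugging Face Spaces a plain `iface.launch()` is typical;
+ # `debug=True` mainly helps when iterating locally or in a notebook.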
+
+ # Alternative GPU pipeline (Qwen2-VL), kept for reference:
+ # !pip install --upgrade git+https://github.com/huggingface/transformers.git byaldi accelerate flash-attn qwen_vl_utils pdf2image gradio
+ # !sudo apt-get install -y poppler-utils
+
+ # from byaldi import RAGMultiModalModel
+ # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ # from qwen_vl_utils import process_vision_info
+ # import torch
+ # import gradio as gr
+ # from PIL import Image
+
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+
+ # # Initialize the model with float16 precision and handle fallback to CPU
+ # def load_model():
+ #     try:
+ #         vlm = Qwen2VLForConditionalGeneration.from_pretrained(
+ #             "Qwen/Qwen2-VL-2B-Instruct",
+ #             torch_dtype=torch.float16,
+ #             attn_implementation="flash_attention_2",  # FlashAttention enabled
+ #             device_map="cuda"
+ #         )
+ #         print("Model loaded with FlashAttention on GPU")
+ #     except RuntimeError as e:
+ #         if "FlashAttention only supports Ampere GPUs" in str(e):
+ #             print("FlashAttention not supported. Falling back to standard attention.")
+ #             vlm = Qwen2VLForConditionalGeneration.from_pretrained(
+ #                 "Qwen/Qwen2-VL-2B-Instruct",
+ #                 torch_dtype=torch.float16,  # Still use float16 to save memory
+ #                 attn_implementation="eager",  # Standard attention ("default" is not a valid option)
+ #                 device_map="cuda" if torch.cuda.is_available() else "cpu"
+ #             )
+ #         else:
+ #             raise e  # Re-raise runtime errors unrelated to FlashAttention
+ #     return vlm
+
+ # # Load the model
+ # vlm = load_model()
+
+ # # OCR function to extract text from an image
+ # def ocr_image(image, query="Extract text from the image"):
+ #     messages = [
+ #         {
+ #             "role": "user",
+ #             "content": [
+ #                 {
+ #                     "type": "image",
+ #                     "image": image,
+ #                 },
+ #                 {"type": "text", "text": query},
+ #             ],
+ #         }
+ #     ]
+
+ #     # Prepare inputs for the model
+ #     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ #     image_inputs, video_inputs = process_vision_info(messages)
+ #     inputs = processor(
+ #         text=[text],
+ #         images=image_inputs,
+ #         videos=video_inputs,
+ #         padding=True,
+ #         return_tensors="pt",
+ #     )
+ #     inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
+
+ #     # Generate the output text using the model
+ #     generated_ids = vlm.generate(**inputs, max_new_tokens=512)
+ #     generated_ids_trimmed = [
+ #         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ #     ]
+ #     output_text = processor.batch_decode(
+ #         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ #     )
+ #     return output_text[0]
+
+ # # Gradio interface
+ # def process_image(image):
+ #     return ocr_image(image)
+
+ # # Create Gradio interface for uploading an image
+ # interface = gr.Interface(
+ #     fn=process_image,
+ #     inputs=gr.Image(type="pil"),
+ #     outputs="text",
+ #     title="Hindi & English OCR",
+ #     description="Upload an image containing text in Hindi or English to extract the text using OCR."
+ # )
+
+ # # Launch Gradio interface in Colab
+ # interface.launch(share=True, debug=True)
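+
+ # # Example call (hypothetical file 'hindi_sample.png'), assuming the GPU stack above is installed:
+ # print(ocr_image(Image.open("hindi_sample.png")))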
requirements.txt ADDED
@@ -0,0 +1,8 @@
+
+ torch==2.0.1
+ torchvision==0.15.2
+ transformers==4.37.2
+ tiktoken==0.6.0
+ verovio==4.3.1
+ accelerate==0.28.0
+ gradio