Spaces:

Tanish28
/

Text_Extract

Sleeping

App Files Files Community

Tanish28 committed on Feb 20, 2025

Commit

09bde2e

verified ·

1 Parent(s): 0b28163

Create app.py

Browse files

Files changed (1) hide show

app.py +98 -0

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import os
+import io
+from pdf2image import convert_from_path
+from openai import OpenAI
+import base64
+import asyncio
+from datetime import datetime
+import gradio as gr
+# We'll use an environment variable for the API key in Spaces
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+class PDFTextExtractor:
+    def __init__(self, api_key):
+        self.client = OpenAI(api_key=api_key)
+    async def extract_text_from_pdf(self, pdf_path):
+        try:
+            if not os.path.exists(pdf_path):
+                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+            print(f"Processing PDF: {pdf_path}")
+            images = convert_from_path(pdf_path)
+            extracted_texts = []
+            for i, image in enumerate(images):
+                print(f"Processing page {i+1}...")
+                img_buffer = io.BytesIO()
+                image.save(img_buffer, format='PNG')
+                img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
+                response = self.client.chat.completions.create(
+                    model="gpt-4-vision-preview",
+                    messages=[
+                        {
+                            "role": "system",
+                            "content": "Extract ALL text from this image exactly as it appears, preserving all formatting, numbers, and special characters. Include everything you can see, from headers to footers, timestamps to footnotes. Also include the tickmarks present in the forms."
+                        },
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "Please extract and transcribe ALL text visible in this image, exactly as it appears. Include every piece of text you can see, maintaining the exact formatting, spacing, and line breaks."
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {
+                                        "url": f"data:image/png;base64,{img_base64}"
+                                    }
+                                }
+                            ]
+                        }
+                    ],
+                    max_tokens=4096
+                )
+                extracted_texts.append({
+                    'page': i + 1,
+                    'text': response.choices[0].message.content
+                })
+            return extracted_texts
+        except Exception as e:
+            print(f"Error in text extraction: {str(e)}")
+            return None
+def extract_text(pdf_file):
+    if OPENAI_API_KEY is None:
+        return "Error: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."
+    extractor = PDFTextExtractor(OPENAI_API_KEY)
+    pdf_path = pdf_file.name
+    extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))
+    if extracted_texts:
+        output = ""
+        for page in extracted_texts:
+            output += f"\n\n=== Page {page['page']} ===\n\n"
+            output += page['text']
+        return output
+    else:
+        return "Failed to extract text from PDF"
+iface = gr.Interface(
+    fn=extract_text,
+    inputs=gr.File(label="Upload PDF"),
+    outputs="text",
+    title="PDF Text Extractor",
+    description="Upload a PDF file to extract all text using OpenAI's GPT-4 Vision."
+)
+iface.launch()