Tanish28 committed on
Commit
09bde2e
·
verified ·
1 Parent(s): 0b28163

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ from pdf2image import convert_from_path
4
+ from openai import OpenAI
5
+ import base64
6
+ import asyncio
7
+ from datetime import datetime
8
+ import gradio as gr
9
+
10
# The API key is read from the environment (e.g. a secret configured for the
# Space). A value that is unset, empty, or only whitespace is normalised to
# None so that a single `is None` check detects a missing key; surrounding
# whitespace (such as a trailing newline pasted with the secret) is stripped.
OPENAI_API_KEY = (os.environ.get("OPENAI_API_KEY") or "").strip() or None
12
+
13
class PDFTextExtractor:
    """Transcribe the pages of a PDF with an OpenAI vision-capable chat model.

    Every page is rendered to an image with ``convert_from_path``, encoded as
    a base64 PNG data URL and sent in its own chat-completion request.
    """

    # Model used when the caller does not choose one.
    # NOTE(review): this is a "preview" model name — confirm it is still
    # served by the API; pass ``model=...`` to the constructor to override.
    DEFAULT_MODEL = "gpt-4-vision-preview"

    # Upper bound on the tokens generated per page.
    MAX_TOKENS = 4096

    SYSTEM_PROMPT = (
        "Extract ALL text from this image exactly as it appears, preserving "
        "all formatting, numbers, and special characters. Include everything "
        "you can see, from headers to footers, timestamps to footnotes. Also "
        "include the tickmarks present in the forms."
    )

    USER_PROMPT = (
        "Please extract and transcribe ALL text visible in this image, "
        "exactly as it appears. Include every piece of text you can see, "
        "maintaining the exact formatting, spacing, and line breaks."
    )

    def __init__(self, api_key, model=None):
        """Create the OpenAI client used for all requests.

        Args:
            api_key: OpenAI API key handed to the ``OpenAI`` client.
            model: Optional model name; falls back to ``DEFAULT_MODEL``.
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model or self.DEFAULT_MODEL

    async def extract_text_from_pdf(self, pdf_path):
        """Transcribe every page of the PDF at ``pdf_path``.

        Nothing in this coroutine is awaited: the PDF conversion and the API
        requests run synchronously, one page after another.

        Args:
            pdf_path: Filesystem path of the PDF to process.

        Returns:
            A list with one ``{'page': <1-based number>, 'text': <str>}`` dict
            per page, in page order, or ``None`` if anything went wrong (the
            error, including a missing file, is printed rather than raised).
        """
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")

            print(f"Processing PDF: {pdf_path}")
            images = convert_from_path(pdf_path)

            extracted_texts = []
            for page_number, image in enumerate(images, start=1):
                print(f"Processing page {page_number}...")
                extracted_texts.append({
                    'page': page_number,
                    'text': self._transcribe_page(image),
                })

            return extracted_texts

        except Exception as e:
            # Best-effort contract: callers test the result for falsiness
            # instead of catching exceptions, so report and signal with None.
            print(f"Error in text extraction: {str(e)}")
            return None

    @staticmethod
    def _png_data_url(image):
        """Return ``image`` serialised as a ``data:image/png;base64,...`` URL."""
        with io.BytesIO() as buffer:
            image.save(buffer, format='PNG')
            encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        return f"data:image/png;base64,{encoded}"

    def _transcribe_page(self, image):
        """Send one page image to the model and return the transcribed text."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": self.SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.USER_PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": self._png_data_url(image),
                            },
                        },
                    ],
                },
            ],
            max_tokens=self.MAX_TOKENS,
        )
        # The message content may be None (e.g. the model returned no text);
        # normalise it so every page entry carries a string.
        return response.choices[0].message.content or ""
70
+
71
def extract_text(pdf_file):
    """Gradio callback: transcribe an uploaded PDF and return it as one string.

    Args:
        pdf_file: Value supplied by the file-upload input. Either an object
            exposing the file path as ``.name`` or a plain path string is
            accepted; ``None`` means no file was uploaded.

    Returns:
        The text of all pages, each preceded by a ``=== Page N ===`` header,
        or a human-readable error message when the API key is missing, no
        file was provided, or extraction failed / produced no pages.
    """
    if OPENAI_API_KEY is None:
        return "Error: OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."

    # Submitting the form without a file passes None; report it instead of
    # failing with an AttributeError on `.name`.
    if pdf_file is None:
        return "Error: No PDF file provided. Please upload a PDF file."

    extractor = PDFTextExtractor(OPENAI_API_KEY)

    # Accept both a file-like wrapper (path in `.name`) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    extracted_texts = asyncio.run(extractor.extract_text_from_pdf(pdf_path))

    # None (extraction error) and an empty list (no pages) are both failures.
    if not extracted_texts:
        return "Failed to extract text from PDF"

    # A page whose text is None is rendered as an empty page body.
    return "".join(
        f"\n\n=== Page {page['page']} ===\n\n{page['text'] or ''}"
        for page in extracted_texts
    )
89
+
90
# Gradio front end: a single PDF upload goes in, the transcribed text comes
# out. The interface is built and launched as soon as this module is executed.
iface = gr.Interface(
    title="PDF Text Extractor",
    description="Upload a PDF file to extract all text using OpenAI's GPT-4 Vision.",
    fn=extract_text,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
)

iface.launch()