Vladt-Tempest committed on
Commit
98e4ea6
·
1 Parent(s): 5f046ba

With OCR working ver 1

Browse files
Files changed (3) hide show
  1. app.py +36 -6
  2. packages.txt +1 -0
  3. requirements.txt +3 -54
app.py CHANGED
@@ -1,10 +1,40 @@
 
 
 
1
  import gradio as gr
2
- import sys
3
 
4
- def greet(name):
5
- python_version = sys.version.split()[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- return f"Hello {name}!!\nPython version: {python_version}"
8
 
9
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
10
- demo.launch()
 
 
1
+ from pdf2image import convert_from_path
2
+ from PIL import Image
3
+ import pytesseract
4
  import gradio as gr
 
5
 
6
+ def tesseract_ocr(filepath: str):
7
+ """
8
+ Perform OCR on the given image file and return the extracted text.
9
+ """
10
+ # Open the image file
11
+ with Image.open(filepath) as img:
12
+ # Use pytesseract to do OCR on the image
13
+ text = pytesseract.image_to_string(img)
14
+ return text
15
+
16
+ title = "Invoicer IA"
17
+ description = "extract data from invoice"
18
+ article = "This is a simple OCR application that extracts text from images using Tesseract OCR. You can upload an image of an invoice, and the application will return the extracted text."
19
+ examples = [["example_invoice.png"]]
20
+
21
+
22
+ with gr.Blocks(title=title) as demo:
23
+ gr.Markdown(f'<h1 style="text-align: center; margin-bottom: 1rem;">{title}</h1>')
24
+ gr.Markdown(description)
25
+ gr.Markdown(article)
26
+
27
+ with gr.Row():
28
+ with gr.Column():
29
+ image_input = gr.Image(type="filepath", label="Upload Invoice Image")
30
+ output_text = gr.Textbox(label="Extracted Text", lines=10, placeholder="Extracted text will appear here...")
31
+
32
+ with gr.Column():
33
+ submit_button = gr.Button("Submit")
34
+
35
 
36
+ submit_button.click(fn=tesseract_ocr, inputs=image_input, outputs=output_text)
37
 
38
+ if __name__ == "__main__":
39
+ demo.launch()
40
+
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr-all
requirements.txt CHANGED
@@ -1,54 +1,3 @@
1
- aiofiles==23.2.1
2
- annotated-types==0.7.0
3
- anyio==4.9.0
4
- certifi==2025.1.31
5
- charset-normalizer==3.4.1
6
- click==8.1.8
7
- colorama==0.4.6
8
- fastapi==0.115.12
9
- ffmpy==0.5.0
10
- filelock==3.18.0
11
- fsspec==2025.3.2
12
- gradio==5.23.3
13
- gradio_client==1.8.0
14
- groovy==0.1.2
15
- h11==0.14.0
16
- httpcore==1.0.7
17
- httpx==0.28.1
18
- huggingface-hub==0.30.1
19
- idna==3.10
20
- Jinja2==3.1.6
21
- markdown-it-py==3.0.0
22
- MarkupSafe==3.0.2
23
- mdurl==0.1.2
24
- numpy==2.2.4
25
- orjson==3.10.16
26
- packaging==24.2
27
- pandas==2.2.3
28
- pillow==11.1.0
29
- pydantic==2.11.2
30
- pydantic_core==2.33.1
31
- pydub==0.25.1
32
- Pygments==2.19.1
33
- python-dateutil==2.9.0.post0
34
- python-multipart==0.0.20
35
- pytz==2025.2
36
- PyYAML==6.0.2
37
- requests==2.32.3
38
- rich==14.0.0
39
- ruff==0.11.4
40
- safehttpx==0.1.6
41
- semantic-version==2.10.0
42
- shellingham==1.5.4
43
- six==1.17.0
44
- sniffio==1.3.1
45
- starlette==0.46.1
46
- tomlkit==0.13.2
47
- tqdm==4.67.1
48
- typer==0.15.2
49
- typing-inspection==0.4.0
50
- typing_extensions==4.13.1
51
- tzdata==2025.2
52
- urllib3==2.3.0
53
- uvicorn==0.34.0
54
- websockets==15.0.1
 
1
+ gradio
2
+ pytesseract
3
+ pdf2image