Spaces:
Build error
Build error
Commit ·
274f3a2
1
Parent(s): 1f215d0
Add application file
Browse files- .dockerignore +32 -0
- app.py +2 -1
- main.py +83 -0
- render.yaml +9 -0
.dockerignore
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore any folders or files you don't want in Docker image.
# Each pattern is listed once; there are no "!" exceptions, so order
# does not affect which paths are excluded.

# Project data, runtime output, and tests
data/
logs/
tests/
uploads/
*.csv
*.log

# Secrets and local configuration
.env

# Python bytecode and caches (*.py[cod] also covers *.pyc)
__pycache__/
*.py[cod]
*$py.class

# Virtual environments
venv/
myenv/

# Tooling, VCS, and OS metadata
node_modules/
.git/
.idea/
.DS_Store

# Documentation
*.md

# Images
*.png
*.jpg
*.jpeg
*.gif
*.bmp
*.tiff
*.ico
*.webp
|
app.py
CHANGED
|
@@ -7,9 +7,10 @@ import pandas as pd
|
|
| 7 |
from paddleocr import PaddleOCR
|
| 8 |
from fastapi import FastAPI, UploadFile, File
|
| 9 |
from fastapi.responses import FileResponse, JSONResponse
|
| 10 |
-
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
|
|
|
|
|
|
|
| 13 |
app = FastAPI()
|
| 14 |
|
| 15 |
origins = [
|
|
|
|
| 7 |
from paddleocr import PaddleOCR
|
| 8 |
from fastapi import FastAPI, UploadFile, File
|
| 9 |
from fastapi.responses import FileResponse, JSONResponse
|
|
|
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
| 11 |
|
| 12 |
+
# Set PaddleOCR home directory to /tmp (or another writable directory)
|
| 13 |
+
os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
|
| 14 |
app = FastAPI()
|
| 15 |
|
| 16 |
origins = [
|
main.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from paddleocr import PaddleOCR
|
| 6 |
+
|
| 7 |
+
# Initialize PaddleOCR
# Alternative configuration kept for reference:
# ocr = PaddleOCR(use_angle_cls=True, lang="en", det_db_box_thresh=0.5)
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Load Image
image_path = "image.png"  # Replace with your vendor statement
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None instead
    # of raising; fail here with a clear message rather than inside cvtColor.
    raise FileNotFoundError(f"Could not read image: {image_path}")
# OpenCV loads pixels as BGR; convert to RGB for display with matplotlib.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
height, width, channels = image.shape

# OCR Processing
ocr_results = ocr.ocr(image_path)
print(ocr_results)

# Only the first result (the first page/image) is used.
page = ocr_results[0]
for block in page:
    print(block)

# Lists of recognized texts, their bounding boxes, and confidence scores
texts = page['rec_texts']
boxes = page['dt_polys']
scores = page['rec_scores']
print(texts)

# Zip them together
text_and_boxes = list(zip(texts, boxes, scores))

# Display all results and collect (text, score) pairs
extracted_text = []
for text, box, score in text_and_boxes:
    print(f"Text: {text}")
    print(f"Bounding Box: {box.tolist()}")  # Convert numpy array to regular list
    print(f"Score: {score}")
    print("---")
    extracted_text.append((text, score))


# Print Extracted Text
print("🔹 Extracted Text from Invoice:")
for text, score in extracted_text:
    print(f"{text} (Confidence: {score:.2f})")

# Create a simple dataframe from all OCR text
all_text = [text for text, _ in extracted_text]
print("\n🔹 Creating a simple data structure from all OCR text")
df = pd.DataFrame({'text': all_text})
print(df.head())
df.to_csv("invoice_extracted_text.csv", index=False)

# Display Image with OCR Text Overlay
plt.figure(figsize=(10, 10))
plt.imshow(image)

# Loop invariants for the annotations below.
# y_offset shifts every box vertically (in pixels); 0 means no shift.
# Example: int(0.03 * height) would move boxes down by 3% of the image height.
y_offset = 0
csfont = {'fontname': 'Poppins'}

# Add text annotations
for text, box, _score in text_and_boxes:
    corrected_box = [(x, y + y_offset) for (x, y) in box]

    # Draw bounding box: visit the four corners and return to the first
    # one so the outline is closed.
    outline = [corrected_box[i] for i in (0, 1, 2, 3, 0)]
    plt.plot(
        [x for x, _ in outline],
        [y for _, y in outline],
        'r-',
    )

    # Add text annotation at the box's first corner
    plt.text(corrected_box[0][0], corrected_box[0][1], text, color='blue', fontsize=8, **csfont)

plt.axis("off")
plt.tight_layout()
plt.savefig("s3.png", bbox_inches='tight')
plt.show()

print("\n🔹 Processing complete! Annotated image and extracted data saved.")
|
render.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Render blueprint: a single Python web service that serves the FastAPI
# app object `app` from app.py via uvicorn.
services:
  - type: web
    name: paddleocr-api
    env: python
    buildCommand: pip install -r requirements.txt
    # Bind to the port the platform assigns through $PORT instead of a
    # hard-coded number, so the service keeps working if the port changes.
    startCommand: uvicorn app:app --host 0.0.0.0 --port $PORT
    envVars:
      - key: PYTHON_VERSION
        value: 3.9.0
|