Spaces:

harshvisualz
/

PeddleOCR

Build error

App Files Files Community

harshvisualz committed on Jun 3, 2025

Commit

274f3a2

1 Parent(s): 1f215d0

Add application file

Browse files

Files changed (4) hide show

.dockerignore +32 -0
app.py +2 -1
main.py +83 -0
render.yaml +9 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,32 @@

+# Ignore any folders or files you don't want in the Docker image
+data/
+logs/
+tests/
+.env
+*.pyc
+__pycache__/
+node_modules/
+.git/
+.DS_Store
+*.md
+__pycache__/
+*.py[cod]
+*$py.class
+.env
+uploads/
+*.csv
+.DS_Store
+venv/
+.idea/
+*.log
+myenv/
+*.png
+*.jpg
+*.jpeg
+*.gif
+*.bmp
+*.tiff
+*.ico
+*.webp

app.py CHANGED Viewed

@@ -7,9 +7,10 @@ import pandas as pd
 from paddleocr import PaddleOCR
 from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 app = FastAPI()
 origins = [

 from paddleocr import PaddleOCR
 from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
+# Set PaddleOCR home directory to /tmp (or another writable directory)
+os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
 app = FastAPI()
 origins = [

main.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from paddleocr import PaddleOCR
+# Initialize PaddleOCR
+# ocr = PaddleOCR(use_angle_cls=True, lang="en", det_db_box_thresh=0.5)
+ocr = PaddleOCR(use_angle_cls=True, lang='en')
+# Load Image
+image_path = "image.png"  # Replace with your vendor statement
+image = cv2.imread(image_path)
+image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+height, width, channels = image.shape
+# OCR Processing
+ocr_results = ocr.ocr(image_path)
+print(ocr_results)
+extracted_text = []
+page = ocr_results[0]
+for block in page:
+    print(block)
+# Lists of recognized texts and their bounding boxes
+texts = page['rec_texts']
+boxes = page['dt_polys']
+scores = page['rec_scores']
+print(texts)
+# Zip them together
+text_and_boxes = list(zip(texts, boxes, scores))
+# Display all results
+for text, box, score in text_and_boxes:
+    print(f"Text: {text}")
+    print(f"Bounding Box: {box.tolist()}")  # Convert numpy array to regular list
+    print(f"Score: {score}")
+    print("---")
+    extracted_text.append((text, score))
+# Print Extracted Text
+print("🔹 Extracted Text from Invoice:")
+for text, score in extracted_text:
+    print(f"{text} (Confidence: {score:.2f})")
+# Create a simple dataframe from all OCR text
+all_text = [text for text, _ in extracted_text]
+print("\n🔹 Creating a simple data structure from all OCR text")
+df = pd.DataFrame({'text': all_text})
+print(df.head())
+df.to_csv("invoice_extracted_text.csv", index=False)
+# Display Image with OCR Text Overlay
+plt.figure(figsize=(10, 10))
+plt.imshow(image)
+# Add text annotations
+for text, box, score in text_and_boxes:
+    # y_offset = int(0.03 * height)  # 5% downward shift
+    y_offset = 0
+    print(height)
+    corrected_box = [(x, y + y_offset) for (x, y) in box]
+    # Draw bounding box
+    plt.plot(
+        [corrected_box[0][0], corrected_box[1][0], corrected_box[2][0], corrected_box[3][0], corrected_box[0][0]],
+        [corrected_box[0][1], corrected_box[1][1], corrected_box[2][1], corrected_box[3][1], corrected_box[0][1]], 'r-'
+    )
+    # Add text annotation
+    csfont = {'fontname': 'Poppins'}
+    plt.text(corrected_box[0][0], corrected_box[0][1], text, color='blue', fontsize=8, **csfont)
+plt.axis("off")
+plt.tight_layout()
+plt.savefig("s3.png", bbox_inches='tight')
+plt.show()
+print("\n🔹 Processing complete! Annotated image and extracted data saved.")

render.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+services:
+  - type: web
+    name: paddleocr-api
+    env: python
+    buildCommand: pip install -r requirements.txt
+    startCommand: uvicorn app:app --host 0.0.0.0 --port 10000
+    envVars:
+      - key: PYTHON_VERSION
+        value: 3.9.0