Spaces:
Build error
Build error
"""Run PaddleOCR on an invoice image, export the text, and save an annotated copy.

The script reads ``image.png``, runs OCR on it, prints every recognized
text with its polygon and confidence, writes the recognized texts to
``invoice_extracted_text.csv`` and saves the image with the detected
polygons and texts drawn on top as ``s3.png``.
"""
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from paddleocr import PaddleOCR

# Initialize PaddleOCR
# Alternative configuration kept for reference:
#   PaddleOCR(use_angle_cls=True, lang="en", det_db_box_thresh=0.5)
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Fraction of the image height by which overlays are shifted downward.
# 0.0 draws the polygons exactly where the detector reported them.
Y_OFFSET_FRACTION = 0.0

# Load Image
image_path = "image.png"  # Replace with your vendor statement
image = cv2.imread(image_path)
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None
    # instead of raising, so fail here with an actionable message.
    raise FileNotFoundError(f"Could not read image: {image_path}")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # matplotlib expects RGB
height = image.shape[0]

# OCR Processing
ocr_results = ocr.ocr(image_path)
print(ocr_results)
if not ocr_results or ocr_results[0] is None:
    raise RuntimeError(f"OCR returned no results for {image_path}")

# Only the first result (the first page/image) is used.
page = ocr_results[0]
for block in page:
    print(block)

# Lists of recognized texts and their bounding boxes
texts = page['rec_texts']
boxes = page['dt_polys']
scores = page['rec_scores']
print(texts)

# Zip them together; kept as a list because it is iterated twice below.
text_and_boxes = list(zip(texts, boxes, scores))

# Display all results
extracted_text = []
for text, box, score in text_and_boxes:
    print(f"Text: {text}")
    # np.asarray accepts both numpy arrays and plain nested lists.
    print(f"Bounding Box: {np.asarray(box).tolist()}")
    print(f"Score: {score}")
    print("---")
    extracted_text.append((text, score))

# Print Extracted Text
print("🔹 Extracted Text from Invoice:")
for text, score in extracted_text:
    print(f"{text} (Confidence: {score:.2f})")

# Create a simple dataframe from all OCR text
all_text = [text for text, _ in extracted_text]
print("\n🔹 Creating a simple data structure from all OCR text")
df = pd.DataFrame({'text': all_text})
print(df.head())
df.to_csv("invoice_extracted_text.csv", index=False)

# Display Image with OCR Text Overlay
plt.figure(figsize=(10, 10))
plt.imshow(image)

# Loop-invariant drawing settings.
y_offset = int(Y_OFFSET_FRACTION * height)
csfont = {'fontname': 'Poppins'}

# Add text annotations
for text, box, score in text_and_boxes:
    corrected_box = [(x, y + y_offset) for (x, y) in box]
    if not corrected_box:
        continue  # nothing to draw for an empty polygon

    # Draw bounding box: repeat the first vertex so the outline is closed.
    outline_x, outline_y = zip(*(corrected_box + corrected_box[:1]))
    plt.plot(outline_x, outline_y, 'r-')

    # Add text annotation anchored at the polygon's first vertex.
    anchor_x, anchor_y = corrected_box[0]
    plt.text(anchor_x, anchor_y, text, color='blue', fontsize=8, **csfont)

plt.axis("off")
plt.tight_layout()
plt.savefig("s3.png", bbox_inches='tight')
plt.show()
print("\n🔹 Processing complete! Annotated image and extracted data saved.")