Spaces:
Runtime error
Runtime error
Commit ·
141ce11
1
Parent(s): c6b6da5
add tesseract-ocr
Browse files- Dockerfile +1 -2
- __pycache__/__init__.cpython-310.pyc +0 -0
- app/__pycache__/main.cpython-310.pyc +0 -0
- app/__pycache__/ocr.cpython-310.pyc +0 -0
- app/main.py +26 -2
- app/models/flights.py +0 -0
- app/ocr.py +82 -1
- app/routers/flight.py +39 -0
- app/routers/webscrap.py +8 -0
- requirements.txt +3 -1
Dockerfile
CHANGED
|
@@ -2,11 +2,10 @@ FROM python:3.10
|
|
| 2 |
|
| 3 |
WORKDIR /code
|
| 4 |
# Install libgl1-mesa-glx
|
| 5 |
-
RUN apt-get update && apt-get install -y libgl1-mesa-glx
|
| 6 |
|
| 7 |
COPY ./requirements.txt /code/requirements.txt
|
| 8 |
|
| 9 |
-
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
| 10 |
|
| 11 |
COPY ./app /code/app
|
| 12 |
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /code
|
| 4 |
# Install libgl1-mesa-glx
|
| 5 |
+
# System libraries: libGL for OpenCV, plus the Tesseract binary and headers.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y libgl1-mesa-glx tesseract-ocr libtesseract-dev \
    && rm -rf /var/lib/apt/lists/*

COPY ./requirements.txt /code/requirements.txt

# Install the Python dependencies; without this step the packages listed in
# requirements.txt are never installed into the image.
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY ./app /code/app
|
| 11 |
|
__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (161 Bytes). View file
|
|
|
app/__pycache__/main.cpython-310.pyc
CHANGED
|
Binary files a/app/__pycache__/main.cpython-310.pyc and b/app/__pycache__/main.cpython-310.pyc differ
|
|
|
app/__pycache__/ocr.cpython-310.pyc
CHANGED
|
Binary files a/app/__pycache__/ocr.cpython-310.pyc and b/app/__pycache__/ocr.cpython-310.pyc differ
|
|
|
app/main.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
-
from fastapi import FastAPI, File, UploadFile
|
| 2 |
from fastapi.responses import JSONResponse
|
| 3 |
-
from app.ocr import easyocr_ocr
|
| 4 |
|
| 5 |
app = FastAPI()
|
| 6 |
|
|
|
|
| 7 |
@app.post("/upload/")
|
| 8 |
async def upload_image(file: UploadFile = File(...)):
|
| 9 |
try:
|
|
@@ -19,3 +20,26 @@ async def upload_image(file: UploadFile = File(...)):
|
|
| 19 |
|
| 20 |
except Exception as e:
|
| 21 |
return JSONResponse(content={"error": str(e)}, status_code=500)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
from fastapi.responses import JSONResponse
|
| 3 |
+
from app.ocr import easyocr_ocr, tesseract_ocr,tesseract_ocr_pdf
|
| 4 |
|
| 5 |
app = FastAPI()
|
| 6 |
|
| 7 |
+
|
| 8 |
@app.post("/upload/")
|
| 9 |
async def upload_image(file: UploadFile = File(...)):
|
| 10 |
try:
|
|
|
|
| 20 |
|
| 21 |
except Exception as e:
|
| 22 |
return JSONResponse(content={"error": str(e)}, status_code=500)
|
| 23 |
+
|
| 24 |
+
@app.post("/tesseract/extract")
async def upload_image(file: UploadFile = File(...)):
    """Run Tesseract OCR on an uploaded image or PDF and return the text.

    The upload is written to a private temporary file, dispatched to
    ``tesseract_ocr`` (png/jpg/jpeg) or ``tesseract_ocr_pdf`` (pdf) based on
    the filename extension, and the temporary file is removed afterwards.

    Args:
        file: The uploaded file; its filename extension selects the OCR path.

    Returns:
        ``JSONResponse`` with ``{"result": <text>}`` on success, or
        ``{"error": <message>}`` with status 500 on an unexpected failure.

    Raises:
        HTTPException: 400 when the extension is not png/jpg/jpeg/pdf, and
            any ``HTTPException`` raised by the OCR helpers.
    """
    # NOTE(review): this handler shares the name `upload_image` with the
    # "/upload/" handler defined earlier in this module; both routes are
    # registered, but the module-level name ends up bound to this one.
    import os
    import tempfile

    filename = file.filename or ""
    file_extension = filename.lower().split('.')[-1]

    if file_extension in ('png', 'jpg', 'jpeg'):
        run_ocr = tesseract_ocr
    elif file_extension == 'pdf':
        run_ocr = tesseract_ocr_pdf
    else:
        # Raised outside the try block so it reaches the client as a 400
        # instead of being converted into a 500 by the generic handler.
        raise HTTPException(status_code=400, detail="Unsupported file format")

    temp_path = None
    try:
        contents = await file.read()
        # Use a generated name rather than the client-supplied filename so
        # uploads cannot collide with each other or escape the temp directory.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=f".{file_extension}"
        ) as tmp:
            tmp.write(contents)
            temp_path = tmp.name

        ocr_result = await run_ocr(temp_path)
        return JSONResponse(content={"result": ocr_result})
    except HTTPException:
        # Preserve status codes chosen by the OCR helpers.
        raise
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
|
app/models/flights.py
ADDED
|
File without changes
|
app/ocr.py
CHANGED
|
@@ -1,4 +1,17 @@
|
|
| 1 |
import easyocr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
def reformat_ocr_result(result):
|
| 4 |
mapping = {
|
|
@@ -48,9 +61,77 @@ def reformat_ocr_result(result):
|
|
| 48 |
print(formatted_output)
|
| 49 |
return formatted_output
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def easyocr_ocr(image_path):
|
| 52 |
-
reader = easyocr.Reader(['
|
| 53 |
result = reader.readtext(image_path)
|
| 54 |
result_list = [f"{text} (confidence: {confidence:.4f})" for (bbox, text, confidence) in result]
|
| 55 |
formatted_output = reformat_ocr_result(result_list)
|
| 56 |
return formatted_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import easyocr
|
| 2 |
+
import fitz
|
| 3 |
+
import pytesseract
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import cv2
|
| 6 |
+
import io
|
| 7 |
+
import pymupdf
|
| 8 |
+
from fastapi import HTTPException
|
| 9 |
+
import numpy as np
|
| 10 |
+
from pytesseract import Output
|
| 11 |
+
import imutils
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
import shutil

# Only fall back to the default Windows install location when no `tesseract`
# executable is found on PATH. Where the binary is on PATH (for example after
# installing the `tesseract-ocr` system package), pytesseract's default
# command is left untouched.
if shutil.which("tesseract") is None:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 15 |
|
| 16 |
def reformat_ocr_result(result):
|
| 17 |
mapping = {
|
|
|
|
| 61 |
print(formatted_output)
|
| 62 |
return formatted_output
|
| 63 |
|
| 64 |
+
async def detect_rotation(image):
    """Load an image from disk and rotate it by Tesseract's suggested angle.

    Args:
        image: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The image loaded by OpenCV, rotated with ``imutils.rotate_bound`` by
        the ``"rotate"`` angle reported by ``pytesseract.image_to_osd``.

    Raises:
        ValueError: If OpenCV cannot read an image from the given path.
    """
    image_path = image
    bgr = cv2.imread(image_path)
    if bgr is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Could not read image: {image_path}")

    # Convert from BGR to RGB channel ordering and use Tesseract to
    # determine the text orientation.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

    # Display the orientation information.
    print("[INFO] detected orientation: {}".format(
        results["orientation"]))
    print("[INFO] rotate by {} degrees to correct".format(
        results["rotate"]))
    print("[INFO] detected script: {}".format(results["script"]))

    # Rotate the original (BGR) image to correct the orientation.
    rotated = imutils.rotate_bound(bgr, angle=results["rotate"])
    return rotated
|
| 79 |
+
|
| 80 |
+
|
| 81 |
def easyocr_ocr(image_path):
    """Read text from an image with EasyOCR and reformat the result.

    Args:
        image_path: Image location handed to ``easyocr.Reader.readtext``.

    Returns:
        Whatever ``reformat_ocr_result`` produces for the list of
        ``"<text> (confidence: <score>)"`` strings built from the detections.
    """
    # A reader for the language list ['id'] is created on every call.
    detections = easyocr.Reader(['id']).readtext(image_path)

    labelled = []
    for _bbox, text, confidence in detections:
        labelled.append(f"{text} (confidence: {confidence:.4f})")

    return reformat_ocr_result(labelled)
|
| 87 |
+
|
| 88 |
+
# Function to convert PDF page to PIL image
|
| 89 |
+
def pdf_page_to_image(pdf_page):
    """Render a PDF page to a PIL image.

    Args:
        pdf_page: A page object exposing ``get_pixmap()``.

    Returns:
        A PIL ``Image`` in "RGB" mode built from the pixmap's raw samples.
    """
    pixmap = pdf_page.get_pixmap()
    dimensions = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", dimensions, pixmap.samples)
|
| 94 |
+
|
| 95 |
+
async def tesseract_ocr_pdf(pdf_path):
    """OCR every page of a PDF and return the concatenated text.

    Each page is rendered to a PNG inside a private temporary directory,
    passed to ``tesseract_ocr``, and the directory is removed afterwards.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        The text of all pages, each followed by a newline.

    Raises:
        HTTPException: 400 if opening, rendering, or OCR of the PDF fails.
    """
    import os
    import tempfile

    try:
        pdf_document = fitz.open(pdf_path)
        try:
            page_texts = []
            # A per-call directory keeps concurrent requests from overwriting
            # each other's page images and guarantees cleanup.
            with tempfile.TemporaryDirectory() as work_dir:
                for page_number in range(len(pdf_document)):
                    page = pdf_document.load_page(page_number)
                    image = pdf_page_to_image(page)

                    temp_image_path = os.path.join(
                        work_dir, f"temp_page_{page_number}.png"
                    )
                    image.save(temp_image_path)

                    # Perform OCR on the saved image.
                    page_texts.append(await tesseract_ocr(temp_image_path))
        finally:
            # Close the document even when a page fails to process.
            pdf_document.close()

        return "".join(page_text + "\n" for page_text in page_texts)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        raise HTTPException(
            status_code=400, detail="Error processing PDF file"
        ) from e
|
| 116 |
+
|
| 117 |
+
async def tesseract_ocr(image_path):
    """Preprocess an image and extract its text with Tesseract.

    The image is first rotated by ``detect_rotation``, then converted to
    grayscale, blurred, Otsu-thresholded, morphologically opened and inverted
    before being passed to ``pytesseract.image_to_string`` with ``--psm 6``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with line breaks replaced by spaces and runs of
        spaces collapsed to a single space.
    """
    import re

    image = await detect_rotation(image_path)

    # Grayscale, Gaussian blur, Otsu's threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(
        blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    result = pytesseract.image_to_string(invert, config='--psm 6')
    result = result.replace('\n', ' ').replace('\r', ' ')
    # Replacing line breaks leaves runs of spaces behind; collapse them.
    result = re.sub(r' {2,}', ' ', result)

    return result
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
app/routers/flight.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from typing import List
|
| 4 |
+
import requests
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
|
| 7 |
+
router = APIRouter()
|
| 8 |
+
|
| 9 |
+
# Example flight sites list
|
| 10 |
+
# In-memory list of tracked flight pages; each entry is a plain dict with a
# "url" and a "threshold" key.
flight_sites = [
    dict(url="https://example.com/flight1", threshold=200),
    dict(url="https://example.com/flight2", threshold=300),
]
|
| 14 |
+
|
| 15 |
+
class Flight(BaseModel):
    """Request/response schema for one entry of ``flight_sites``."""

    # Address of the flight page.
    url: str
    # presumably the price limit for this flight — TODO confirm against the
    # code that consumes it (not visible here).
    threshold: float
|
| 18 |
+
|
| 19 |
+
def check_price(url, threshold):
    """Fetch a page and return the price found in its ``price`` element.

    Args:
        url: Address of the page to download.
        threshold: Accepted for interface compatibility; not used here.

    Returns:
        The price as a float, parsed from the text of the first element with
        class ``price`` after removing ``$`` and thousands separators.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
        ValueError: If no ``price`` element exists or its text is not a
            number.
    """
    # A timeout keeps an unresponsive site from blocking the caller forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Extracting price assuming a specific HTML class or tag
    price_tag = soup.find(class_="price")
    if price_tag is None:
        raise ValueError(f"No element with class 'price' found at {url}")

    price_text = price_tag.get_text().replace("$", "").replace(",", "").strip()
    return float(price_text)
|
| 25 |
+
|
| 26 |
+
@router.get("/", response_model=List[Flight])
def read_flights():
    """Return the module-level ``flight_sites`` list."""
    return flight_sites
|
| 29 |
+
|
| 30 |
+
@router.post("/", response_model=Flight)
def add_flight(flight: Flight):
    """Append the given flight to ``flight_sites`` and return it.

    The model is stored as a plain dict (via ``flight.dict()``).
    """
    # NOTE(review): `.dict()` is deprecated in Pydantic v2 in favour of
    # `.model_dump()` — confirm which Pydantic version is installed.
    flight_sites.append(flight.dict())
    return flight
|
| 34 |
+
|
| 35 |
+
@router.delete("/", response_model=Flight)
def remove_flight(url: str):
    """Remove every tracked flight with the given URL.

    Args:
        url: URL of the flight entry to remove (query parameter).

    Returns:
        The first removed entry, so the response matches the declared
        ``Flight`` response model.

    Raises:
        HTTPException: 404 if no tracked flight has this URL.
    """
    from fastapi import HTTPException

    removed = [site for site in flight_sites if site["url"] == url]
    if not removed:
        raise HTTPException(status_code=404, detail="Flight not found")

    # Mutate the shared list in place so every reference to it stays valid
    # and no `global` rebinding is needed.
    flight_sites[:] = [site for site in flight_sites if site["url"] != url]
    return removed[0]
|
app/routers/webscrap.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
from app.routers import base
|
| 3 |
+
|
| 4 |
+
router = APIRouter()
|
| 5 |
+
|
| 6 |
+
@router.get("/webscrap", response_model=base.Response)
async def webscrap():
    """Return a fixed ``base.Response`` with status_code 200 and detail "webscrap".

    NOTE(review): ``base`` is imported from ``app.routers`` but is not
    visible here — confirm that module exists and defines ``Response``.
    """
    return base.Response(status_code=200, detail="webscrap")
|
requirements.txt
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
fastapi
|
| 2 |
uvicorn
|
| 3 |
easyocr
|
| 4 |
-
pillow
|
|
|
|
|
|
|
|
|
| 1 |
fastapi
uvicorn
# Required by FastAPI to parse multipart form uploads (UploadFile / File).
python-multipart
easyocr
pillow
pytesseract
PyMuPDF
# Imported directly by app/ocr.py (cv2, imutils, numpy).
opencv-python-headless
imutils
numpy
# Imported by app/routers/flight.py (requests, bs4).
requests
beautifulsoup4
|