ariansyahdedy committed on
Commit
141ce11
·
1 Parent(s): c6b6da5

add tesseract-ocr

Browse files
Dockerfile CHANGED
@@ -2,11 +2,10 @@ FROM python:3.10
2
 
3
  WORKDIR /code
4
  # Install libgl1-mesa-glx
5
- RUN apt-get update && apt-get install -y libgl1-mesa-glx
6
 
7
  COPY ./requirements.txt /code/requirements.txt
8
 
9
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
10
 
11
  COPY ./app /code/app
12
 
 
2
 
3
  WORKDIR /code
4
  # Install libgl1-mesa-glx
5
+ RUN apt-get update && apt-get install -y libgl1-mesa-glx tesseract-ocr libtesseract-dev
6
 
7
  COPY ./requirements.txt /code/requirements.txt
8
 
 
9
 
10
  COPY ./app /code/app
11
 
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (161 Bytes). View file
 
app/__pycache__/main.cpython-310.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-310.pyc and b/app/__pycache__/main.cpython-310.pyc differ
 
app/__pycache__/ocr.cpython-310.pyc CHANGED
Binary files a/app/__pycache__/ocr.cpython-310.pyc and b/app/__pycache__/ocr.cpython-310.pyc differ
 
app/main.py CHANGED
@@ -1,9 +1,10 @@
1
- from fastapi import FastAPI, File, UploadFile
2
  from fastapi.responses import JSONResponse
3
- from app.ocr import easyocr_ocr
4
 
5
  app = FastAPI()
6
 
 
7
  @app.post("/upload/")
8
  async def upload_image(file: UploadFile = File(...)):
9
  try:
@@ -19,3 +20,26 @@ async def upload_image(file: UploadFile = File(...)):
19
 
20
  except Exception as e:
21
  return JSONResponse(content={"error": str(e)}, status_code=500)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
  from fastapi.responses import JSONResponse
3
+ from app.ocr import easyocr_ocr, tesseract_ocr,tesseract_ocr_pdf
4
 
5
  app = FastAPI()
6
 
7
+
8
  @app.post("/upload/")
9
  async def upload_image(file: UploadFile = File(...)):
10
  try:
 
20
 
21
  except Exception as e:
22
  return JSONResponse(content={"error": str(e)}, status_code=500)
23
+
24
@app.post("/tesseract/extract")
async def upload_image(file: UploadFile = File(...)):
    """Run Tesseract OCR on an uploaded PNG/JPG/JPEG image or PDF.

    The upload is written to a private temporary file, handed to
    ``tesseract_ocr`` (images) or ``tesseract_ocr_pdf`` (PDFs), and the
    temporary file is removed afterwards.

    NOTE(review): this handler shares the name ``upload_image`` with the
    ``/upload/`` handler in this module; the name is kept unchanged here so
    the edit stays interface-compatible.

    Returns:
        JSONResponse with ``{"result": <text>}`` on success, or
        ``{"error": <message>}`` with status 500 on unexpected failures.

    Raises:
        HTTPException: 400 when the file extension is not supported, or
            whatever HTTPException the OCR helpers raise.
    """
    import os
    import tempfile

    # Decide how to process the file before anything is written to disk.
    file_extension = (file.filename or "").lower().rsplit(".", 1)[-1]
    if file_extension in ("png", "jpg", "jpeg"):
        run_ocr = tesseract_ocr
    elif file_extension == "pdf":
        run_ocr = tesseract_ocr_pdf
    else:
        raise HTTPException(status_code=400, detail="Unsupported file format")

    file_path = None
    try:
        contents = await file.read()
        # Use a generated temp name instead of the client-supplied filename so
        # the upload cannot choose where it is written and concurrent uploads
        # cannot overwrite each other.
        with tempfile.NamedTemporaryFile(
            suffix=f".{file_extension}", delete=False
        ) as tmp:
            tmp.write(contents)
            file_path = tmp.name

        ocr_result = await run_ocr(file_path)
        return JSONResponse(content={"result": ocr_result})
    except HTTPException:
        # Let FastAPI turn deliberate HTTP errors into their own status codes
        # rather than folding them into a generic 500.
        raise
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        if file_path is not None and os.path.exists(file_path):
            os.remove(file_path)
app/models/flights.py ADDED
File without changes
app/ocr.py CHANGED
@@ -1,4 +1,17 @@
1
  import easyocr
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  def reformat_ocr_result(result):
4
  mapping = {
@@ -48,9 +61,77 @@ def reformat_ocr_result(result):
48
  print(formatted_output)
49
  return formatted_output
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def easyocr_ocr(image_path):
52
- reader = easyocr.Reader(['en'])
53
  result = reader.readtext(image_path)
54
  result_list = [f"{text} (confidence: {confidence:.4f})" for (bbox, text, confidence) in result]
55
  formatted_output = reformat_ocr_result(result_list)
56
  return formatted_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import easyocr
2
+ import fitz
3
+ import pytesseract
4
+ from PIL import Image
5
+ import cv2
6
+ import io
7
+ import pymupdf
8
+ from fastapi import HTTPException
9
+ import numpy as np
10
+ from pytesseract import Output
11
+ import imutils
12
+
13
+
14
import sys

# Only override the Tesseract executable location on Windows. On other
# platforms (e.g. the Docker image, which installs tesseract-ocr via apt)
# pytesseract's default command is left untouched so the binary is resolved
# from PATH.
if sys.platform == "win32":
    pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
15
 
16
  def reformat_ocr_result(result):
17
  mapping = {
 
61
  print(formatted_output)
62
  return formatted_output
63
 
64
async def detect_rotation(image):
    """Load an image from disk and rotate it to the orientation Tesseract detects.

    Args:
        image: Path of the image file to load (passed to ``cv2.imread``).

    Returns:
        The loaded image rotated by the angle reported in the ``"rotate"``
        field of Tesseract's orientation/script detection.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    loaded = cv2.imread(image)
    if loaded is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise ValueError(f"Could not read image file: {image}")

    # Tesseract expects RGB channel ordering, OpenCV loads BGR.
    rgb = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

    # display the orientation information
    print("[INFO] detected orientation: {}".format(
        results["orientation"]))
    print("[INFO] rotate by {} degrees to correct".format(
        results["rotate"]))
    print("[INFO] detected script: {}".format(results["script"]))

    # rotate the image to correct the orientation
    return imutils.rotate_bound(loaded, angle=results["rotate"])
79
+
80
+
81
def easyocr_ocr(image_path):
    """Read text from *image_path* with EasyOCR and return the reformatted result.

    A reader for the ``'id'`` language is created for the call; every
    detection is rendered as ``"<text> (confidence: <score>)"`` and the list
    is passed through ``reformat_ocr_result``.
    """
    detections = easyocr.Reader(['id']).readtext(image_path)
    labelled = []
    for _bbox, text, confidence in detections:
        labelled.append(f"{text} (confidence: {confidence:.4f})")
    return reformat_ocr_result(labelled)
87
+
88
def pdf_page_to_image(pdf_page):
    """Render a PDF page to a PIL image.

    Args:
        pdf_page: Page object exposing ``get_pixmap()``.

    Returns:
        A PIL ``Image`` in RGB mode built from the page's pixmap samples.
    """
    pixmap = pdf_page.get_pixmap()
    dimensions = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", dimensions, pixmap.samples)
94
+
95
async def tesseract_ocr_pdf(pdf_path):
    """OCR every page of a PDF and return the concatenated text.

    Each page is rendered to an image, saved to a uniquely named temporary
    PNG, run through ``tesseract_ocr`` and the temporary file is deleted
    again. The text of each page is followed by a newline.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The recognized text of all pages as one string.

    Raises:
        HTTPException: 400 if the PDF cannot be opened or processed.
    """
    import os
    import tempfile

    try:
        pdf_document = fitz.open(pdf_path)
        try:
            page_texts = []
            for page_number in range(len(pdf_document)):
                page = pdf_document.load_page(page_number)
                image = pdf_page_to_image(page)

                # A generated name avoids collisions between concurrent
                # requests, which fixed "temp_page_N.png" names would cause.
                handle, temp_image_path = tempfile.mkstemp(suffix=".png")
                os.close(handle)
                try:
                    image.save(temp_image_path)
                    # Perform OCR on the saved image
                    page_texts.append(await tesseract_ocr(temp_image_path))
                finally:
                    os.remove(temp_image_path)

            return "".join(f"{page_text}\n" for page_text in page_texts)
        finally:
            # Close the document even when a page fails to process.
            pdf_document.close()
    except Exception as e:
        print(f"Error opening PDF: {e}")
        raise HTTPException(status_code=400, detail="Error processing PDF file") from e
116
+
117
async def tesseract_ocr(image_path):
    """Preprocess an image and extract its text with Tesseract.

    The image is first orientation-corrected by ``detect_rotation``, then
    converted to grayscale, blurred, Otsu-thresholded, morphologically opened
    and inverted before being passed to ``pytesseract.image_to_string``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text on a single line: every run of whitespace
        (including line breaks) is collapsed to one space and leading and
        trailing whitespace is removed.
    """
    image = await detect_rotation(image_path)

    # Grayscale, Gaussian blur, Otsu's threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    result = pytesseract.image_to_string(invert, config='--psm 6')

    # The previous chain ended in a replace of a single space by a single
    # space, which did nothing; split/join actually collapses the whitespace
    # left behind by replacing line breaks.
    return " ".join(result.split())
135
+
136
+
137
+
app/routers/flight.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ from pydantic import BaseModel
3
+ from typing import List
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+
7
+ router = APIRouter()
8
+
9
+ # Example flight sites list
10
+ flight_sites = [
11
+ {"url": "https://example.com/flight1", "threshold": 200},
12
+ {"url": "https://example.com/flight2", "threshold": 300},
13
+ ]
14
+
15
class Flight(BaseModel):
    """Schema of one entry in the flight site list."""

    # Address of the flight page.
    url: str
    # Numeric threshold stored alongside the URL.
    threshold: float
18
+
19
def check_price(url, threshold):
    """Fetch *url* and return the price found in its ``price`` element.

    Args:
        url: Page to download.
        threshold: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The text of the first element with class ``price``, with ``$``
        removed, converted to ``float``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
        ValueError: If the page has no ``price`` element or its text is not
            a number.
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Extracting price assuming a specific HTML class or tag
    price_element = soup.find(class_="price")
    if price_element is None:
        raise ValueError(f"No element with class 'price' found at {url}")

    return float(price_element.get_text().replace("$", ""))
25
+
26
@router.get("/", response_model=List[Flight])
def read_flights():
    """Return the current contents of the module-level flight site list."""
    registered_sites = flight_sites
    return registered_sites
29
+
30
@router.post("/", response_model=Flight)
def add_flight(flight: Flight):
    """Append the posted flight to the site list and echo it back."""
    new_entry = flight.dict()
    flight_sites.append(new_entry)
    return flight
34
+
35
@router.delete("/", response_model=Flight)
def remove_flight(url: str):
    """Remove every flight site whose URL equals *url*.

    Args:
        url: URL of the flight site to remove.

    Returns:
        The first removed entry. It carries both ``url`` and ``threshold``,
        so it satisfies the declared ``Flight`` response model (the previous
        ``{"message": ..., "url": ...}`` payload had no ``threshold`` and
        could not be validated against it).

    Raises:
        HTTPException: 404 if no entry has the given URL.
    """
    from fastapi import HTTPException

    removed = next((site for site in flight_sites if site["url"] == url), None)
    if removed is None:
        raise HTTPException(status_code=404, detail="Flight not found")

    # Update the shared list in place so no ``global`` rebinding is needed.
    flight_sites[:] = [site for site in flight_sites if site["url"] != url]
    return removed
app/routers/webscrap.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ from app.routers import base
3
+
4
+ router = APIRouter()
5
+
6
@router.get("/webscrap", response_model=base.Response)
async def webscrap():
    """Return a fixed ``base.Response`` with status 200 and detail "webscrap"."""
    reply = base.Response(status_code=200, detail="webscrap")
    return reply
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  fastapi
2
  uvicorn
3
  easyocr
4
- pillow
 
 
 
1
  fastapi
2
  uvicorn
3
  easyocr
4
+ pillow
5
+ pytesseract
6
+ PyMuPDF