Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -25,12 +25,12 @@ CORS(app, resources={r"/*": {"origins": ["http://localhost:*", "https://play.dev
|
|
| 25 |
process_status = {}
|
| 26 |
process_results = {}
|
| 27 |
app.config['file_path'] = None
|
|
|
|
| 28 |
|
| 29 |
data_ready = False # Flag to check if extraction is complete
|
| 30 |
lock = threading.Lock() # Lock to manage concurrent access
|
| 31 |
extracted_texts = {}
|
| 32 |
-
|
| 33 |
-
os.environ["HF_HOME"] = "/app/cache"
|
| 34 |
ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
| 35 |
ocr_model = AutoModel.from_pretrained(
|
| 36 |
'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
|
|
@@ -45,8 +45,8 @@ class DynamicTableExtractor:
|
|
| 45 |
def __init__(self, pdf_bytes: bytes, output_folder: str):
|
| 46 |
self.pdf_bytes = pdf_bytes
|
| 47 |
self.images = convert_from_bytes(pdf_bytes)
|
| 48 |
-
self.output_folder = output_folder
|
| 49 |
-
os.makedirs(output_folder, exist_ok=True)
|
| 50 |
|
| 51 |
def detect_lines(self, img_array):
|
| 52 |
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
|
@@ -209,6 +209,7 @@ def extract_text_from_image(image_path):
|
|
| 209 |
return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
|
| 210 |
|
| 211 |
def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
|
|
|
|
| 212 |
os.makedirs(output_dir, exist_ok=True)
|
| 213 |
text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
|
| 214 |
extracted_texts = {}
|
|
@@ -220,6 +221,8 @@ def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_im
|
|
| 220 |
return extracted_texts
|
| 221 |
|
| 222 |
def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
|
|
|
|
|
|
|
| 223 |
extracted_texts = {}
|
| 224 |
table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
|
| 225 |
with open(pdf_path, "rb") as f:
|
|
@@ -278,7 +281,8 @@ def process_pdf(pdf_path, process_id):
|
|
| 278 |
extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
|
| 279 |
table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
|
| 280 |
extracted_texts.update(table_texts)
|
| 281 |
-
temp_file_path =
|
|
|
|
| 282 |
filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
|
| 283 |
app.config['file_path'] = filepath
|
| 284 |
process_status[process_id] = "completed"
|
|
@@ -298,7 +302,9 @@ def upload_pdf():
|
|
| 298 |
return jsonify({'error': 'No file provided'}), 400
|
| 299 |
|
| 300 |
file = request.files['file']
|
| 301 |
-
pdf_path = os.path.join("uploads", file.filename)
|
|
|
|
|
|
|
| 302 |
os.makedirs("uploads", exist_ok=True)
|
| 303 |
file.save(pdf_path)
|
| 304 |
process_id = str(uuid.uuid4())
|
|
|
|
| 25 |
process_status = {}
|
| 26 |
process_results = {}
|
| 27 |
app.config['file_path'] = None
|
| 28 |
+
TEMP_DIR = tempfile.mkdtemp()
|
| 29 |
|
| 30 |
data_ready = False # Flag to check if extraction is complete
|
| 31 |
lock = threading.Lock() # Lock to manage concurrent access
|
| 32 |
extracted_texts = {}
|
| 33 |
+
os.environ["HF_HOME"] = os.path.join(TEMP_DIR, "cache") #"/app/cache"
|
|
|
|
| 34 |
ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
| 35 |
ocr_model = AutoModel.from_pretrained(
|
| 36 |
'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
|
|
|
|
| 45 |
def __init__(self, pdf_bytes: bytes, output_folder: str):
|
| 46 |
self.pdf_bytes = pdf_bytes
|
| 47 |
self.images = convert_from_bytes(pdf_bytes)
|
| 48 |
+
self.output_folder = os.path.join(TEMP_DIR, output_folder)
|
| 49 |
+
os.makedirs(self.output_folder, exist_ok=True)
|
| 50 |
|
| 51 |
def detect_lines(self, img_array):
|
| 52 |
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
|
|
|
| 209 |
return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
|
| 210 |
|
| 211 |
def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
|
| 212 |
+
output_dir = os.path.join(TEMP_DIR, output_dir)
|
| 213 |
os.makedirs(output_dir, exist_ok=True)
|
| 214 |
text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
|
| 215 |
extracted_texts = {}
|
|
|
|
| 221 |
return extracted_texts
|
| 222 |
|
| 223 |
def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
|
| 224 |
+
output_folder = os.path.join(TEMP_DIR, output_folder)
|
| 225 |
+
os.makedirs(output_folder, exist_ok=True)
|
| 226 |
extracted_texts = {}
|
| 227 |
table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
|
| 228 |
with open(pdf_path, "rb") as f:
|
|
|
|
| 281 |
extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
|
| 282 |
table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
|
| 283 |
extracted_texts.update(table_texts)
|
| 284 |
+
temp_file_path = os.path.join(TEMP_DIR, f"extracted_{process_id}.txt")
|
| 285 |
+
# temp_file_path = tempfile.mktemp(suffix='.txt')
|
| 286 |
filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
|
| 287 |
app.config['file_path'] = filepath
|
| 288 |
process_status[process_id] = "completed"
|
|
|
|
| 302 |
return jsonify({'error': 'No file provided'}), 400
|
| 303 |
|
| 304 |
file = request.files['file']
|
| 305 |
+
pdf_path = os.path.join(TEMP_DIR, "uploads", file.filename)
|
| 306 |
+
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
|
| 307 |
+
# pdf_path = os.path.join("uploads", file.filename)
|
| 308 |
os.makedirs("uploads", exist_ok=True)
|
| 309 |
file.save(pdf_path)
|
| 310 |
process_id = str(uuid.uuid4())
|