Spaces:

Spanicin
/

pdfextraction

Paused

App Files Files Community

Spanicin commited on Feb 14, 2025

Commit

01480ea

verified ·

1 Parent(s): 4256658

Create app.py

Browse files

Files changed (1) hide show

app.py +362 -0

app.py ADDED Viewed

	@@ -0,0 +1,362 @@

+from pdf2image import convert_from_bytes, convert_from_path
+from PIL import Image
+import numpy as np
+import cv2
+import pdfplumber
+from transformers import AutoModel, AutoTokenizer
+import io
+import os
+from PyPDF2 import PdfReader, PdfWriter
+from langchain_openai import ChatOpenAI
+from flask import Flask, request, jsonify, send_file
+from flask_cors import CORS
+import threading
+import time
+import uuid
+import tempfile
+import pytesseract
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+app = Flask(__name__, template_folder='templates')
+CORS(app, resources={r"/*": {"origins": ["http://localhost:*", "https://play.dev.ryzeai.ai", "https://ryze2ui.dev.ryzeai.ai"]}})
+# Store process status and results
+process_status = {}
+process_results = {}
+app.config['file_path'] = None
+data_ready = False  # Flag to check if extraction is complete
+lock = threading.Lock()  # Lock to manage concurrent access
+extracted_texts = {}
+ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+ocr_model = AutoModel.from_pretrained(
+    'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
+    low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True
+).eval().cuda()
+API_KEY = "sk-8754"
+BASE_URL = "https://aura.dev.ryzeai.ai"
+llm = ChatOpenAI(temperature=0, model_name="azure/gpt-4o-mini", api_key=API_KEY, base_url=BASE_URL)
+class DynamicTableExtractor:
+    def __init__(self, pdf_bytes: bytes, output_folder: str):
+        self.pdf_bytes = pdf_bytes
+        self.images = convert_from_bytes(pdf_bytes)
+        self.output_folder = output_folder
+        os.makedirs(output_folder, exist_ok=True)
+    def detect_lines(self, img_array):
+        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
+        horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel)
+        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
+        vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel)
+        return horizontal_lines, vertical_lines
+    def find_table_boundaries(self, horizontal_lines: np.ndarray, vertical_lines):
+        combined = cv2.addWeighted(horizontal_lines, 1, vertical_lines, 1, 0)
+        contours, _ = cv2.findContours(combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        table_boundaries = []
+        min_table_area = 5000
+        for contour in contours:
+            x, y, w, h = cv2.boundingRect(contour)
+            area = w * h
+            if area > min_table_area:
+                padding = 5
+                table_boundaries.append((
+                    max(0, x - padding),
+                    max(0, y - padding),
+                    x + w + padding,
+                    y + h + padding
+                ))
+        return table_boundaries
+    def detect_tables_by_text_alignment(self, img):
+        text_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
+        rows = {}
+        for i, text in enumerate(text_data['text']):
+            if text.strip():
+                y = text_data['top'][i]
+                if y not in rows:
+                    rows[y] = []
+                rows[y].append({
+                    'text': text,
+                    'left': text_data['left'][i],
+                    'width': text_data['width'][i],
+                    'height': text_data['height'][i],
+                    'conf': text_data['conf'][i]
+                })
+        table_regions = []
+        current_region = None
+        last_y = None
+        for y, row_texts in sorted(rows.items()):
+            is_tabular = (
+                len(row_texts) >= 3 and
+                any(t['text'].replace('.', '').replace('-', '').replace('$', '').isdigit()
+                    for t in row_texts)
+            )
+            if is_tabular:
+                if current_region and last_y and y - last_y > 50:
+                    current_region['bottom'] = last_y + row_texts[0]['height']
+                    table_regions.append(current_region)
+                    current_region = None
+                if current_region is None:
+                    current_region = {
+                        'top': y,
+                        'left': min(t['left'] for t in row_texts),
+                        'right': max(t['left'] + t['width'] for t in row_texts)
+                    }
+                else:
+                    current_region['right'] = max(
+                        current_region['right'],
+                        max(t['left'] + t['width'] for t in row_texts)
+                    )
+                last_y = y
+            elif current_region is not None:
+                current_region['bottom'] = y
+                table_regions.append(current_region)
+                current_region = None
+        if current_region:
+            current_region['bottom'] = last_y + 20
+            table_regions.append(current_region)
+        return table_regions
+    def merge_boundaries(self, boundaries):
+        if not boundaries:
+            return []
+        def overlap_or_nearby(b1, b2, threshold=20):
+            return not (b1[2] + threshold < b2[0] or b2[2] + threshold < b1[0] or
+                        b1[3] + threshold < b2[1] or b2[3] + threshold < b1[1])
+        merged = []
+        boundaries = sorted(boundaries, key=lambda x: (x[1], x[0]))
+        current = list(boundaries[0])
+        for next_bound in boundaries[1:]:
+            if overlap_or_nearby(current, next_bound):
+                current[0] = min(current[0], next_bound[0])
+                current[1] = min(current[1], next_bound[1])
+                current[2] = max(current[2], next_bound[2])
+                current[3] = max(current[3], next_bound[3])
+            else:
+                merged.append(tuple(current))
+                current = list(next_bound)
+        merged.append(tuple(current))
+        return merged
+    def remove_tables_from_image(self, img, table_boundaries):
+      img_array = np.array(img)
+      for x1, y1, x2, y2 in table_boundaries:
+          img_array[y1:y2, x1:x2] = 255  # Fill table area with white
+      return Image.fromarray(img_array)
+    def extract_tables(self) -> None:
+        for page_num, page_img in enumerate(self.images, start=1):
+            img_array = np.array(page_img)
+            horizontal_lines, vertical_lines = self.detect_lines(img_array)
+            line_based_boundaries = self.find_table_boundaries(horizontal_lines, vertical_lines)
+            text_based_regions = self.detect_tables_by_text_alignment(page_img)
+            text_based_boundaries = [
+                (r['left'], r['top'], r['right'], r['bottom'])
+                for r in text_based_regions
+            ]
+            all_boundaries = self.merge_boundaries(line_based_boundaries + text_based_boundaries)
+            cleaned_image = self.remove_tables_from_image(page_img, all_boundaries)
+            cleaned_output_path = os.path.join(self.output_folder, f'cleaned_page{page_num}.png')
+            cleaned_image.save(cleaned_output_path)
+            table_count = 0
+            for bounds in all_boundaries:
+                table_region = page_img.crop(bounds)
+                gray_table = cv2.cvtColor(np.array(table_region), cv2.COLOR_RGB2GRAY)
+                text = pytesseract.image_to_string(gray_table).strip()
+                if text:
+                    table_count += 1
+                    output_path = os.path.join(self.output_folder, f'page{page_num}_table{table_count}.png')
+                    table_region.save(output_path)
+def categorize_pdf_pages(pdf_path):
+    page_categories = {}
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_number, page in enumerate(pdf.pages):
+            text = page.extract_text()
+            tables = page.extract_tables()
+            page_categories[page_number] = "text & table" if tables and text else "only table" if tables else "only text" if text else "empty"
+    return page_categories
+def extract_text_from_image(image_path):
+    return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
+def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
+    os.makedirs(output_dir, exist_ok=True)
+    text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
+    extracted_texts = {}
+    images = convert_from_path(pdf_path, dpi=300)
+    for page_num in text_only_pages:
+        image_path = f"{output_dir}/page_{page_num+1}.png"
+        images[page_num].save(image_path, 'PNG')
+        extracted_texts[page_num + 1] = extract_text_from_image(image_path)
+    return extracted_texts
+def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
+    extracted_texts = {}
+    table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
+    with open(pdf_path, "rb") as f:
+        pdf_reader = PdfReader(f)
+        for page_num in table_pages:
+            pdf_writer = PdfWriter()
+            pdf_writer.add_page(pdf_reader.pages[page_num])
+            pdf_bytes_io = io.BytesIO()
+            pdf_writer.write(pdf_bytes_io)
+            pdf_bytes = pdf_bytes_io.getvalue()
+            extractor = DynamicTableExtractor(pdf_bytes, output_folder)
+            extractor.extract_tables()
+            saved_images = sorted(os.listdir(output_folder))
+            page_images = [img for img in saved_images if img.endswith('.png')]
+            page_texts = [extract_text_from_image(os.path.join(output_folder, img)) for img in page_images]
+            if page_texts:
+                extracted_texts[page_num] = "\n".join(page_texts)
+    return extracted_texts
+# @app.route('/upload', methods=['POST'])
+# def extract_from_pdf():
+#     global extracted_texts
+#     if 'file' not in request.files:
+#         return jsonify({'error': 'No file provided'}), 400
+#     file = request.files['file']
+#     pdf_path = os.path.join("uploads", file.filename)
+#     os.makedirs("uploads", exist_ok=True)
+#     file.save(pdf_path)
+#     categorized_pages = categorize_pdf_pages(pdf_path)
+#     extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
+#     table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
+#     extracted_texts.update(table_texts)
+#     return jsonify({'message': 'Extraction completed', 'data': extracted_texts})
+# @app.route('/query', methods=['POST'])
+# def query_extracted_data():
+#     global extracted_texts
+#     user_input = request.form['user_question']
+#     response = llm.invoke(str(extracted_texts) + " " + user_input)
+#     return jsonify({'response': response.content.strip()})
+def save_extracted_text(text_dict, filepath):
+    with open(filepath, "w", encoding="utf-8") as f:  # Open in text mode
+        for page, text in text_dict.items():
+            f.write(f"Page {page}:\n{text}\n\n")
+    return filepath
+def process_pdf(pdf_path, process_id):
+    global extracted_texts, data_ready
+    with lock:
+        data_ready = False  # Reset flag when new process starts
+    process_status[process_id] = "in_progress"
+    categorized_pages = categorize_pdf_pages(pdf_path)
+    extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
+    table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
+    extracted_texts.update(table_texts)
+    temp_file_path = tempfile.mktemp(suffix='.txt')
+    filepath = save_extracted_text(extracted_texts, temp_file_path)  # Save extracted text to file
+    app.config['file_path'] = filepath
+    process_status[process_id] = "completed"
+    process_results[process_id] = {
+            "response": extracted_texts,
+            }
+    with lock:
+        data_ready = True  # Mark extraction as complete
+@app.route('/upload', methods=['POST'])
+def upload_pdf():
+    global extracted_texts, data_ready
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+    file = request.files['file']
+    pdf_path = os.path.join("uploads", file.filename)
+    os.makedirs("uploads", exist_ok=True)
+    file.save(pdf_path)
+    process_id = str(uuid.uuid4())
+    thread = threading.Thread(target=process_pdf, args=(pdf_path, process_id))
+    thread.start()  # Start extraction in a separate thread
+    return jsonify({'message': 'File uploaded, extraction in progress', "process_id": process_id})
+@app.route('/status', methods=['GET'])
+def check_task_status():
+    process_id = request.args.get('process_id', None)
+    if process_id not in process_status:
+        return jsonify({"error": "Invalid process ID"}), 400
+    status = process_status[process_id]
+    if status == "completed":
+        result = process_results[process_id]
+        response = result["response"]
+        return jsonify({
+            "status": "completed",
+            "response": response,
+            "url": f"/download?file_path={app.config['file_path']}"
+        }), 200
+    elif status == "in_progress":
+        return jsonify({"status": "in_progress"}), 200
+    elif status == "error":
+        return jsonify({"status": "error", "error": process_results[process_id]["error"]}), 500
+@app.route('/query', methods=['POST'])
+def query_extracted_data():
+    process_id = request.args.get('process_id')
+    result = process_results[process_id]
+    text = result["response"]
+    user_input = request.form['user_question']
+    response = llm.invoke(str(text) + " " + user_input)
+    return jsonify({'response': response.content.strip()})
+@app.route("/download")
+def download_file():
+    file_path = app.config.get('file_path')
+    if file_path:
+        return send_file(file_path, as_attachment=True)
+    else:
+        return jsonify({"message": "File path is missing."}), 404
+if __name__ == '__main__':
+    app.run(debug=False)
+    # Start Ngrok in a separate thread
+    # def start_ngrok():
+    #     public_url = ngrok.connect(8000)
+    #     print(f"Ngrok public URL: {public_url}")
+    # ngrok_thread = Thread(target=start_ngrok)
+    # ngrok_thread.start()
+    # # Run Flask app
+    # app.run(port=8000)