Spaces:
Paused
Paused
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pdf2image import convert_from_bytes, convert_from_path
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import numpy as np
|
| 4 |
+
import cv2
|
| 5 |
+
import pdfplumber
|
| 6 |
+
from transformers import AutoModel, AutoTokenizer
|
| 7 |
+
import io
|
| 8 |
+
import os
|
| 9 |
+
from PyPDF2 import PdfReader, PdfWriter
|
| 10 |
+
from langchain_openai import ChatOpenAI
|
| 11 |
+
from flask import Flask, request, jsonify, send_file
|
| 12 |
+
from flask_cors import CORS
|
| 13 |
+
import threading
|
| 14 |
+
import time
|
| 15 |
+
import uuid
|
| 16 |
+
import tempfile
|
| 17 |
+
import pytesseract
|
| 18 |
+
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
| 19 |
+
|
| 20 |
+
app = Flask(__name__, template_folder='templates')
|
| 21 |
+
CORS(app, resources={r"/*": {"origins": ["http://localhost:*", "https://play.dev.ryzeai.ai", "https://ryze2ui.dev.ryzeai.ai"]}})
|
| 22 |
+
|
| 23 |
+
# Store process status and results
|
| 24 |
+
process_status = {}
|
| 25 |
+
process_results = {}
|
| 26 |
+
app.config['file_path'] = None
|
| 27 |
+
|
| 28 |
+
data_ready = False # Flag to check if extraction is complete
|
| 29 |
+
lock = threading.Lock() # Lock to manage concurrent access
|
| 30 |
+
extracted_texts = {}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
| 34 |
+
ocr_model = AutoModel.from_pretrained(
|
| 35 |
+
'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
|
| 36 |
+
low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True
|
| 37 |
+
).eval().cuda()
|
| 38 |
+
|
| 39 |
+
API_KEY = "sk-8754"
|
| 40 |
+
BASE_URL = "https://aura.dev.ryzeai.ai"
|
| 41 |
+
llm = ChatOpenAI(temperature=0, model_name="azure/gpt-4o-mini", api_key=API_KEY, base_url=BASE_URL)
|
| 42 |
+
|
| 43 |
+
class DynamicTableExtractor:
|
| 44 |
+
def __init__(self, pdf_bytes: bytes, output_folder: str):
|
| 45 |
+
self.pdf_bytes = pdf_bytes
|
| 46 |
+
self.images = convert_from_bytes(pdf_bytes)
|
| 47 |
+
self.output_folder = output_folder
|
| 48 |
+
os.makedirs(output_folder, exist_ok=True)
|
| 49 |
+
|
| 50 |
+
def detect_lines(self, img_array):
|
| 51 |
+
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
| 52 |
+
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
| 53 |
+
|
| 54 |
+
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
|
| 55 |
+
horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel)
|
| 56 |
+
|
| 57 |
+
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
|
| 58 |
+
vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel)
|
| 59 |
+
|
| 60 |
+
return horizontal_lines, vertical_lines
|
| 61 |
+
|
| 62 |
+
def find_table_boundaries(self, horizontal_lines: np.ndarray, vertical_lines):
|
| 63 |
+
combined = cv2.addWeighted(horizontal_lines, 1, vertical_lines, 1, 0)
|
| 64 |
+
contours, _ = cv2.findContours(combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
| 65 |
+
table_boundaries = []
|
| 66 |
+
min_table_area = 5000
|
| 67 |
+
|
| 68 |
+
for contour in contours:
|
| 69 |
+
x, y, w, h = cv2.boundingRect(contour)
|
| 70 |
+
area = w * h
|
| 71 |
+
if area > min_table_area:
|
| 72 |
+
padding = 5
|
| 73 |
+
table_boundaries.append((
|
| 74 |
+
max(0, x - padding),
|
| 75 |
+
max(0, y - padding),
|
| 76 |
+
x + w + padding,
|
| 77 |
+
y + h + padding
|
| 78 |
+
))
|
| 79 |
+
return table_boundaries
|
| 80 |
+
|
| 81 |
+
def detect_tables_by_text_alignment(self, img):
|
| 82 |
+
text_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
| 83 |
+
rows = {}
|
| 84 |
+
|
| 85 |
+
for i, text in enumerate(text_data['text']):
|
| 86 |
+
if text.strip():
|
| 87 |
+
y = text_data['top'][i]
|
| 88 |
+
if y not in rows:
|
| 89 |
+
rows[y] = []
|
| 90 |
+
rows[y].append({
|
| 91 |
+
'text': text,
|
| 92 |
+
'left': text_data['left'][i],
|
| 93 |
+
'width': text_data['width'][i],
|
| 94 |
+
'height': text_data['height'][i],
|
| 95 |
+
'conf': text_data['conf'][i]
|
| 96 |
+
})
|
| 97 |
+
|
| 98 |
+
table_regions = []
|
| 99 |
+
current_region = None
|
| 100 |
+
last_y = None
|
| 101 |
+
|
| 102 |
+
for y, row_texts in sorted(rows.items()):
|
| 103 |
+
is_tabular = (
|
| 104 |
+
len(row_texts) >= 3 and
|
| 105 |
+
any(t['text'].replace('.', '').replace('-', '').replace('$', '').isdigit()
|
| 106 |
+
for t in row_texts)
|
| 107 |
+
)
|
| 108 |
+
if is_tabular:
|
| 109 |
+
if current_region and last_y and y - last_y > 50:
|
| 110 |
+
current_region['bottom'] = last_y + row_texts[0]['height']
|
| 111 |
+
table_regions.append(current_region)
|
| 112 |
+
current_region = None
|
| 113 |
+
if current_region is None:
|
| 114 |
+
current_region = {
|
| 115 |
+
'top': y,
|
| 116 |
+
'left': min(t['left'] for t in row_texts),
|
| 117 |
+
'right': max(t['left'] + t['width'] for t in row_texts)
|
| 118 |
+
}
|
| 119 |
+
else:
|
| 120 |
+
current_region['right'] = max(
|
| 121 |
+
current_region['right'],
|
| 122 |
+
max(t['left'] + t['width'] for t in row_texts)
|
| 123 |
+
)
|
| 124 |
+
last_y = y
|
| 125 |
+
elif current_region is not None:
|
| 126 |
+
current_region['bottom'] = y
|
| 127 |
+
table_regions.append(current_region)
|
| 128 |
+
current_region = None
|
| 129 |
+
|
| 130 |
+
if current_region:
|
| 131 |
+
current_region['bottom'] = last_y + 20
|
| 132 |
+
table_regions.append(current_region)
|
| 133 |
+
|
| 134 |
+
return table_regions
|
| 135 |
+
|
| 136 |
+
def merge_boundaries(self, boundaries):
|
| 137 |
+
if not boundaries:
|
| 138 |
+
return []
|
| 139 |
+
|
| 140 |
+
def overlap_or_nearby(b1, b2, threshold=20):
|
| 141 |
+
return not (b1[2] + threshold < b2[0] or b2[2] + threshold < b1[0] or
|
| 142 |
+
b1[3] + threshold < b2[1] or b2[3] + threshold < b1[1])
|
| 143 |
+
|
| 144 |
+
merged = []
|
| 145 |
+
boundaries = sorted(boundaries, key=lambda x: (x[1], x[0]))
|
| 146 |
+
current = list(boundaries[0])
|
| 147 |
+
|
| 148 |
+
for next_bound in boundaries[1:]:
|
| 149 |
+
if overlap_or_nearby(current, next_bound):
|
| 150 |
+
current[0] = min(current[0], next_bound[0])
|
| 151 |
+
current[1] = min(current[1], next_bound[1])
|
| 152 |
+
current[2] = max(current[2], next_bound[2])
|
| 153 |
+
current[3] = max(current[3], next_bound[3])
|
| 154 |
+
else:
|
| 155 |
+
merged.append(tuple(current))
|
| 156 |
+
current = list(next_bound)
|
| 157 |
+
|
| 158 |
+
merged.append(tuple(current))
|
| 159 |
+
return merged
|
| 160 |
+
|
| 161 |
+
def remove_tables_from_image(self, img, table_boundaries):
|
| 162 |
+
img_array = np.array(img)
|
| 163 |
+
|
| 164 |
+
for x1, y1, x2, y2 in table_boundaries:
|
| 165 |
+
img_array[y1:y2, x1:x2] = 255 # Fill table area with white
|
| 166 |
+
|
| 167 |
+
return Image.fromarray(img_array)
|
| 168 |
+
|
| 169 |
+
def extract_tables(self) -> None:
|
| 170 |
+
for page_num, page_img in enumerate(self.images, start=1):
|
| 171 |
+
img_array = np.array(page_img)
|
| 172 |
+
horizontal_lines, vertical_lines = self.detect_lines(img_array)
|
| 173 |
+
|
| 174 |
+
line_based_boundaries = self.find_table_boundaries(horizontal_lines, vertical_lines)
|
| 175 |
+
text_based_regions = self.detect_tables_by_text_alignment(page_img)
|
| 176 |
+
text_based_boundaries = [
|
| 177 |
+
(r['left'], r['top'], r['right'], r['bottom'])
|
| 178 |
+
for r in text_based_regions
|
| 179 |
+
]
|
| 180 |
+
|
| 181 |
+
all_boundaries = self.merge_boundaries(line_based_boundaries + text_based_boundaries)
|
| 182 |
+
cleaned_image = self.remove_tables_from_image(page_img, all_boundaries)
|
| 183 |
+
cleaned_output_path = os.path.join(self.output_folder, f'cleaned_page{page_num}.png')
|
| 184 |
+
cleaned_image.save(cleaned_output_path)
|
| 185 |
+
|
| 186 |
+
table_count = 0
|
| 187 |
+
|
| 188 |
+
for bounds in all_boundaries:
|
| 189 |
+
table_region = page_img.crop(bounds)
|
| 190 |
+
gray_table = cv2.cvtColor(np.array(table_region), cv2.COLOR_RGB2GRAY)
|
| 191 |
+
text = pytesseract.image_to_string(gray_table).strip()
|
| 192 |
+
|
| 193 |
+
if text:
|
| 194 |
+
table_count += 1
|
| 195 |
+
output_path = os.path.join(self.output_folder, f'page{page_num}_table{table_count}.png')
|
| 196 |
+
table_region.save(output_path)
|
| 197 |
+
|
| 198 |
+
def categorize_pdf_pages(pdf_path):
|
| 199 |
+
page_categories = {}
|
| 200 |
+
with pdfplumber.open(pdf_path) as pdf:
|
| 201 |
+
for page_number, page in enumerate(pdf.pages):
|
| 202 |
+
text = page.extract_text()
|
| 203 |
+
tables = page.extract_tables()
|
| 204 |
+
page_categories[page_number] = "text & table" if tables and text else "only table" if tables else "only text" if text else "empty"
|
| 205 |
+
return page_categories
|
| 206 |
+
|
| 207 |
+
def extract_text_from_image(image_path):
|
| 208 |
+
return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
|
| 209 |
+
|
| 210 |
+
def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
|
| 211 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 212 |
+
text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
|
| 213 |
+
extracted_texts = {}
|
| 214 |
+
images = convert_from_path(pdf_path, dpi=300)
|
| 215 |
+
for page_num in text_only_pages:
|
| 216 |
+
image_path = f"{output_dir}/page_{page_num+1}.png"
|
| 217 |
+
images[page_num].save(image_path, 'PNG')
|
| 218 |
+
extracted_texts[page_num + 1] = extract_text_from_image(image_path)
|
| 219 |
+
return extracted_texts
|
| 220 |
+
|
| 221 |
+
def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
|
| 222 |
+
extracted_texts = {}
|
| 223 |
+
table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
|
| 224 |
+
with open(pdf_path, "rb") as f:
|
| 225 |
+
pdf_reader = PdfReader(f)
|
| 226 |
+
for page_num in table_pages:
|
| 227 |
+
pdf_writer = PdfWriter()
|
| 228 |
+
pdf_writer.add_page(pdf_reader.pages[page_num])
|
| 229 |
+
pdf_bytes_io = io.BytesIO()
|
| 230 |
+
pdf_writer.write(pdf_bytes_io)
|
| 231 |
+
pdf_bytes = pdf_bytes_io.getvalue()
|
| 232 |
+
extractor = DynamicTableExtractor(pdf_bytes, output_folder)
|
| 233 |
+
extractor.extract_tables()
|
| 234 |
+
saved_images = sorted(os.listdir(output_folder))
|
| 235 |
+
page_images = [img for img in saved_images if img.endswith('.png')]
|
| 236 |
+
page_texts = [extract_text_from_image(os.path.join(output_folder, img)) for img in page_images]
|
| 237 |
+
if page_texts:
|
| 238 |
+
extracted_texts[page_num] = "\n".join(page_texts)
|
| 239 |
+
return extracted_texts
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# @app.route('/upload', methods=['POST'])
|
| 243 |
+
# def extract_from_pdf():
|
| 244 |
+
# global extracted_texts
|
| 245 |
+
# if 'file' not in request.files:
|
| 246 |
+
# return jsonify({'error': 'No file provided'}), 400
|
| 247 |
+
# file = request.files['file']
|
| 248 |
+
# pdf_path = os.path.join("uploads", file.filename)
|
| 249 |
+
# os.makedirs("uploads", exist_ok=True)
|
| 250 |
+
# file.save(pdf_path)
|
| 251 |
+
# categorized_pages = categorize_pdf_pages(pdf_path)
|
| 252 |
+
# extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
|
| 253 |
+
# table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
|
| 254 |
+
# extracted_texts.update(table_texts)
|
| 255 |
+
# return jsonify({'message': 'Extraction completed', 'data': extracted_texts})
|
| 256 |
+
|
| 257 |
+
# @app.route('/query', methods=['POST'])
|
| 258 |
+
# def query_extracted_data():
|
| 259 |
+
# global extracted_texts
|
| 260 |
+
# user_input = request.form['user_question']
|
| 261 |
+
# response = llm.invoke(str(extracted_texts) + " " + user_input)
|
| 262 |
+
# return jsonify({'response': response.content.strip()})
|
| 263 |
+
|
| 264 |
+
def save_extracted_text(text_dict, filepath):
|
| 265 |
+
with open(filepath, "w", encoding="utf-8") as f: # Open in text mode
|
| 266 |
+
for page, text in text_dict.items():
|
| 267 |
+
f.write(f"Page {page}:\n{text}\n\n")
|
| 268 |
+
return filepath
|
| 269 |
+
|
| 270 |
+
def process_pdf(pdf_path, process_id):
|
| 271 |
+
global extracted_texts, data_ready
|
| 272 |
+
with lock:
|
| 273 |
+
data_ready = False # Reset flag when new process starts
|
| 274 |
+
|
| 275 |
+
process_status[process_id] = "in_progress"
|
| 276 |
+
categorized_pages = categorize_pdf_pages(pdf_path)
|
| 277 |
+
extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
|
| 278 |
+
table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
|
| 279 |
+
extracted_texts.update(table_texts)
|
| 280 |
+
temp_file_path = tempfile.mktemp(suffix='.txt')
|
| 281 |
+
filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
|
| 282 |
+
app.config['file_path'] = filepath
|
| 283 |
+
process_status[process_id] = "completed"
|
| 284 |
+
process_results[process_id] = {
|
| 285 |
+
"response": extracted_texts,
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
with lock:
|
| 289 |
+
data_ready = True # Mark extraction as complete
|
| 290 |
+
|
| 291 |
+
@app.route('/upload', methods=['POST'])
|
| 292 |
+
def upload_pdf():
|
| 293 |
+
|
| 294 |
+
global extracted_texts, data_ready
|
| 295 |
+
|
| 296 |
+
if 'file' not in request.files:
|
| 297 |
+
return jsonify({'error': 'No file provided'}), 400
|
| 298 |
+
|
| 299 |
+
file = request.files['file']
|
| 300 |
+
pdf_path = os.path.join("uploads", file.filename)
|
| 301 |
+
os.makedirs("uploads", exist_ok=True)
|
| 302 |
+
file.save(pdf_path)
|
| 303 |
+
process_id = str(uuid.uuid4())
|
| 304 |
+
thread = threading.Thread(target=process_pdf, args=(pdf_path, process_id))
|
| 305 |
+
thread.start() # Start extraction in a separate thread
|
| 306 |
+
|
| 307 |
+
return jsonify({'message': 'File uploaded, extraction in progress', "process_id": process_id})
|
| 308 |
+
|
| 309 |
+
@app.route('/status', methods=['GET'])
|
| 310 |
+
def check_task_status():
|
| 311 |
+
process_id = request.args.get('process_id', None)
|
| 312 |
+
if process_id not in process_status:
|
| 313 |
+
return jsonify({"error": "Invalid process ID"}), 400
|
| 314 |
+
|
| 315 |
+
status = process_status[process_id]
|
| 316 |
+
if status == "completed":
|
| 317 |
+
result = process_results[process_id]
|
| 318 |
+
response = result["response"]
|
| 319 |
+
|
| 320 |
+
return jsonify({
|
| 321 |
+
"status": "completed",
|
| 322 |
+
"response": response,
|
| 323 |
+
"url": f"/download?file_path={app.config['file_path']}"
|
| 324 |
+
}), 200
|
| 325 |
+
elif status == "in_progress":
|
| 326 |
+
return jsonify({"status": "in_progress"}), 200
|
| 327 |
+
elif status == "error":
|
| 328 |
+
return jsonify({"status": "error", "error": process_results[process_id]["error"]}), 500
|
| 329 |
+
|
| 330 |
+
@app.route('/query', methods=['POST'])
|
| 331 |
+
def query_extracted_data():
|
| 332 |
+
process_id = request.args.get('process_id')
|
| 333 |
+
result = process_results[process_id]
|
| 334 |
+
text = result["response"]
|
| 335 |
+
user_input = request.form['user_question']
|
| 336 |
+
response = llm.invoke(str(text) + " " + user_input)
|
| 337 |
+
|
| 338 |
+
return jsonify({'response': response.content.strip()})
|
| 339 |
+
|
| 340 |
+
@app.route("/download")
|
| 341 |
+
def download_file():
|
| 342 |
+
file_path = app.config.get('file_path')
|
| 343 |
+
if file_path:
|
| 344 |
+
return send_file(file_path, as_attachment=True)
|
| 345 |
+
else:
|
| 346 |
+
return jsonify({"message": "File path is missing."}), 404
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
if __name__ == '__main__':
|
| 352 |
+
app.run(debug=False)
|
| 353 |
+
# Start Ngrok in a separate thread
|
| 354 |
+
# def start_ngrok():
|
| 355 |
+
# public_url = ngrok.connect(8000)
|
| 356 |
+
# print(f"Ngrok public URL: {public_url}")
|
| 357 |
+
|
| 358 |
+
# ngrok_thread = Thread(target=start_ngrok)
|
| 359 |
+
# ngrok_thread.start()
|
| 360 |
+
|
| 361 |
+
# # Run Flask app
|
| 362 |
+
# app.run(port=8000)
|