Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
-
import traceback
|
| 4 |
from flask import Flask, request, jsonify, render_template
|
| 5 |
import PyPDF2
|
| 6 |
from openai import OpenAI
|
|
@@ -25,23 +25,21 @@ def ocr_page(img_bytes):
|
|
| 25 |
return text
|
| 26 |
except Exception as e:
|
| 27 |
print(f"Error en Pytesseract/OCR: {e}")
|
|
|
|
|
|
|
| 28 |
return ""
|
| 29 |
|
| 30 |
def extract_text_from_file(file):
|
| 31 |
-
"""
|
| 32 |
-
Extrae texto de un archivo PDF, usando PyPDF2 primero y luego Tesseract OCR
|
| 33 |
-
como fallback si el PDF es escaneado. Procesa TODAS las páginas.
|
| 34 |
-
"""
|
| 35 |
file_bytes = file.read()
|
| 36 |
total_text = ""
|
| 37 |
|
| 38 |
-
#
|
| 39 |
try:
|
| 40 |
if file.filename.endswith('.pdf'):
|
| 41 |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
|
| 42 |
for page in pdf_reader.pages:
|
| 43 |
total_text += page.extract_text() or ""
|
| 44 |
-
|
| 45 |
if len(total_text.strip()) > 100:
|
| 46 |
return total_text.strip()
|
| 47 |
|
|
@@ -51,18 +49,15 @@ def extract_text_from_file(file):
|
|
| 51 |
except Exception:
|
| 52 |
pass
|
| 53 |
|
| 54 |
-
#
|
| 55 |
if file.filename.endswith('.pdf'):
|
| 56 |
try:
|
| 57 |
document = fitz.open(stream=file_bytes, filetype="pdf")
|
| 58 |
ocr_text = ""
|
| 59 |
-
|
| 60 |
for i in range(len(document)):
|
| 61 |
page = document.load_page(i)
|
| 62 |
pix = page.get_pixmap(dpi=300)
|
| 63 |
-
|
| 64 |
img_bytes = pix.tobytes("ppm")
|
| 65 |
-
|
| 66 |
ocr_text += ocr_page(img_bytes) + "\n"
|
| 67 |
|
| 68 |
if len(ocr_text.strip()) > 100:
|
|
@@ -73,12 +68,10 @@ def extract_text_from_file(file):
|
|
| 73 |
|
| 74 |
return ""
|
| 75 |
|
| 76 |
-
|
| 77 |
def generate_summary_openai(text):
|
| 78 |
-
"""
|
| 79 |
-
Genera un análisis experto en formato JSON.
|
| 80 |
-
"""
|
| 81 |
try:
|
|
|
|
| 82 |
json_schema = {
|
| 83 |
"type": "object",
|
| 84 |
"properties": {
|
|
@@ -106,6 +99,7 @@ def generate_summary_openai(text):
|
|
| 106 |
messages=[
|
| 107 |
{"role": "system", "content": prompt_text}
|
| 108 |
],
|
|
|
|
| 109 |
response_format={"type": "json_object", "schema": json_schema},
|
| 110 |
temperature=0.3,
|
| 111 |
)
|
|
@@ -116,8 +110,7 @@ def generate_summary_openai(text):
|
|
| 116 |
return structured_data
|
| 117 |
|
| 118 |
except Exception as e:
|
| 119 |
-
|
| 120 |
-
# Relanzamos el error para que sea capturado por el bloque except de summarize
|
| 121 |
raise
|
| 122 |
|
| 123 |
# --- Rutas de Flask ---
|
|
@@ -152,17 +145,16 @@ def summarize():
|
|
| 152 |
|
| 153 |
except Exception as e:
|
| 154 |
# --- BLOQUE DE DIAGNÓSTICO CRÍTICO ---
|
| 155 |
-
# 1. Imprime el traceback completo en la consola
|
| 156 |
print("\n" + "="*50)
|
| 157 |
print("DIAGNÓSTICO: ERROR 500 DURANTE EL PROCESAMIENTO")
|
| 158 |
print(f"Tipo de Error: {type(e).__name__}")
|
| 159 |
print("Traceback Completo:")
|
| 160 |
-
|
|
|
|
| 161 |
print("="*50 + "\n")
|
| 162 |
# ----------------------------------------
|
| 163 |
|
| 164 |
-
#
|
| 165 |
-
# El frontend recibirá un mensaje de error que incluye el tipo de error
|
| 166 |
return jsonify({'error': f"Error interno del servidor. Detalle: {type(e).__name__} - {str(e)}"}), 500
|
| 167 |
|
| 168 |
if __name__ == '__main__':
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
+
import traceback
|
| 4 |
from flask import Flask, request, jsonify, render_template
|
| 5 |
import PyPDF2
|
| 6 |
from openai import OpenAI
|
|
|
|
| 25 |
return text
|
| 26 |
except Exception as e:
|
| 27 |
print(f"Error en Pytesseract/OCR: {e}")
|
| 28 |
+
# Se imprime ante cualquier fallo de OCR (se captura Exception y se devuelve ""); la causa más habitual es que Tesseract no esté instalado
|
| 29 |
+
print("Asegúrate de que Tesseract-OCR esté instalado correctamente en tu Dockerfile.")
|
| 30 |
return ""
|
| 31 |
|
| 32 |
def extract_text_from_file(file):
|
| 33 |
+
"""Extrae texto de un PDF/TXT, usando OCR si es necesario en todas las páginas."""
|
|
|
|
|
|
|
|
|
|
| 34 |
file_bytes = file.read()
|
| 35 |
total_text = ""
|
| 36 |
|
| 37 |
+
# Intento de extracción nativa
|
| 38 |
try:
|
| 39 |
if file.filename.endswith('.pdf'):
|
| 40 |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
|
| 41 |
for page in pdf_reader.pages:
|
| 42 |
total_text += page.extract_text() or ""
|
|
|
|
| 43 |
if len(total_text.strip()) > 100:
|
| 44 |
return total_text.strip()
|
| 45 |
|
|
|
|
| 49 |
except Exception:
|
| 50 |
pass
|
| 51 |
|
| 52 |
+
# Fallback a OCR
|
| 53 |
if file.filename.endswith('.pdf'):
|
| 54 |
try:
|
| 55 |
document = fitz.open(stream=file_bytes, filetype="pdf")
|
| 56 |
ocr_text = ""
|
|
|
|
| 57 |
for i in range(len(document)):
|
| 58 |
page = document.load_page(i)
|
| 59 |
pix = page.get_pixmap(dpi=300)
|
|
|
|
| 60 |
img_bytes = pix.tobytes("ppm")
|
|
|
|
| 61 |
ocr_text += ocr_page(img_bytes) + "\n"
|
| 62 |
|
| 63 |
if len(ocr_text.strip()) > 100:
|
|
|
|
| 68 |
|
| 69 |
return ""
|
| 70 |
|
|
|
|
| 71 |
def generate_summary_openai(text):
|
| 72 |
+
"""Genera un análisis experto en formato JSON usando el nuevo SDK."""
|
|
|
|
|
|
|
| 73 |
try:
|
| 74 |
+
# Esquema JSON
|
| 75 |
json_schema = {
|
| 76 |
"type": "object",
|
| 77 |
"properties": {
|
|
|
|
| 99 |
messages=[
|
| 100 |
{"role": "system", "content": prompt_text}
|
| 101 |
],
|
| 102 |
+
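# SINTAXIS MODERNA (Requiere openai>=1.0.0). NOTA(revisión): "json_object" no documenta la clave "schema"; para imponer un esquema la forma documentada es {"type": "json_schema", "json_schema": {...}} — verificar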
|
| 103 |
response_format={"type": "json_object", "schema": json_schema},
|
| 104 |
temperature=0.3,
|
| 105 |
)
|
|
|
|
| 110 |
return structured_data
|
| 111 |
|
| 112 |
except Exception as e:
|
| 113 |
+
# Propaga el error para que sea capturado en summarize
|
|
|
|
| 114 |
raise
|
| 115 |
|
| 116 |
# --- Rutas de Flask ---
|
|
|
|
| 145 |
|
| 146 |
except Exception as e:
|
| 147 |
# --- BLOQUE DE DIAGNÓSTICO CRÍTICO ---
|
|
|
|
| 148 |
print("\n" + "="*50)
|
| 149 |
print("DIAGNÓSTICO: ERROR 500 DURANTE EL PROCESAMIENTO")
|
| 150 |
print(f"Tipo de Error: {type(e).__name__}")
|
| 151 |
print("Traceback Completo:")
|
| 152 |
+
# Imprime el stack trace completo del error
|
| 153 |
+
traceback.print_exc()
|
| 154 |
print("="*50 + "\n")
|
| 155 |
# ----------------------------------------
|
| 156 |
|
| 157 |
+
# Devuelve al usuario un 500 con el tipo y el mensaje de la excepción (ojo: expone detalles internos al cliente)
|
|
|
|
| 158 |
return jsonify({'error': f"Error interno del servidor. Detalle: {type(e).__name__} - {str(e)}"}), 500
|
| 159 |
|
| 160 |
if __name__ == '__main__':
|