Spaces:

angelsg213
/

TESTING22

Sleeping

App Files Files Community

angelsg213 committed 18 days ago

Commit

e45829e

verified ·

1 Parent(s): 9085ed2

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -41

app.py CHANGED Viewed

@@ -22,9 +22,9 @@ def extraer_texto_pdf(pdf_file):
 def analizar_y_convertir_json(texto):
     """El LLM lee la factura, decide cómo estructurarla y devuelve JSON"""
-    token = os.getenv("HF_TOKEN")
     if not token:
-        return None, "❌ Error: Falta configurar HF_TOKEN en Settings → Secrets"
     # Limitar texto
     texto_limpio = texto[:8000]
@@ -112,7 +112,11 @@ Responde SOLO con el JSON válido (sin explicaciones, sin markdown):"""
                 try:
                     datos_json = json.loads(json_str)
                     print(f"✅ JSON válido extraído con {modelo}")
-                    return datos_json, f"✅ Procesado con {modelo}"
                 except json.JSONDecodeError as e:
                     print(f"⚠️ JSON inválido: {str(e)[:50]}")
                     continue
@@ -124,7 +128,35 @@ Responde SOLO con el JSON válido (sin explicaciones, sin markdown):"""
             print(f"❌ {modelo} falló: {str(e)[:100]}")
             continue
-    return None, "❌ Ningún modelo LLM pudo extraer el JSON. Verifica tu HF_TOKEN."
 # ============= CONVERTIR JSON A CSV =============
 def json_a_csv(datos_json):
@@ -199,44 +231,44 @@ def json_a_csv(datos_json):
 # ============= FUNCIÓN PRINCIPAL =============
 def procesar_factura(pdf_file):
     if pdf_file is None:
-        return "", None, None, "⚠️ Sube un PDF primero"
     # PASO 1: Extraer texto del PDF
-    print("\n📄 Extrayendo texto del PDF...")
     texto = extraer_texto_pdf(pdf_file)
     if texto.startswith("Error"):
-        return "", None, None, f"❌ {texto}"
     # Mostrar preview del texto
     texto_preview = f"{texto[:1500]}..." if len(texto) > 1500 else texto
     # PASO 2: LLM analiza y convierte a JSON
-    print("🤖 El LLM está analizando la factura y creando el JSON...")
-    datos_json, mensaje = analizar_y_convertir_json(texto)
     if not datos_json:
-        return texto_preview, None, None, mensaje
     # PASO 3: Convertir JSON a DataFrame
-    print("📊 Convirtiendo JSON a CSV...")
     df = json_a_csv(datos_json)
     # PASO 4: Guardar CSV
     timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
     numero = datos_json.get('numero_factura', 'factura')
-    numero = re.sub(r'[^\w\-]', '_', str(numero))  # Limpiar caracteres especiales
     csv_filename = f"{numero}_{timestamp}.csv"
     df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
-    # PASO 5: Crear resumen
-    resumen = f"""## ✅ Factura Procesada Exitosamente
-{mensaje}
 ---
-### 📊 JSON Generado por el LLM:
 ```json
 {json.dumps(datos_json, indent=2, ensure_ascii=False)}
@@ -244,66 +276,118 @@ def procesar_factura(pdf_file):
 ---
-### 💾 Archivo CSV:
-- **Nombre:** `{csv_filename}`
-- **Filas:** {len(df)}
-### 📋 Datos Extraídos:
-- **Número:** {datos_json.get('numero_factura', 'N/A')}
-- **Fecha:** {datos_json.get('fecha', 'N/A')}
-- **Productos:** {len(datos_json.get('productos', datos_json.get('conceptos', [])))}
-- **Total:** {datos_json.get('totales', {}).get('total', datos_json.get('total', 'N/A'))}€
 """
-    print(f"✅ CSV guardado: {csv_filename}")
-    return texto_preview, df, csv_filename, resumen
 # ============= INTERFAZ GRADIO =============
-with gr.Blocks(title="Extractor IA de Facturas") as demo:
     gr.Markdown("""
     # Extractor Inteligente de Facturas
     ### Análisis automático de facturas PDF con Inteligencia Artificial
     """)
     with gr.Row():
-        # Columna izquierda - Input
         with gr.Column(scale=1):
-            gr.Markdown("### Archivo de entrada")
             pdf_input = gr.File(
                 label="Seleccionar factura PDF",
                 file_types=[".pdf"],
                 type="filepath"
             )
-            btn = gr.Button("Procesar Factura", variant="primary", size="lg")
-        # Columna derecha - Resultados
         with gr.Column(scale=2):
-            gr.Markdown("### Resultados del análisis")
             with gr.Tabs():
                 with gr.Tab("Vista Previa CSV"):
                     tabla_preview = gr.DataFrame(
-                        label="Datos extraídos",
                         wrap=True,
-                        interactive=False
                     )
                 with gr.Tab("Texto Original"):
                     texto_extraido = gr.Textbox(
                         label="Texto extraído del PDF",
-                        lines=15,
-                        max_lines=20
                     )
-                with gr.Tab("Análisis JSON"):
-                    resumen = gr.Markdown(label="Estructura de datos")
-            csv_output = gr.File(label="Descargar archivo CSV")
     btn.click(
         fn=procesar_factura,
         inputs=[pdf_input],
-        outputs=[texto_extraido, tabla_preview, csv_output, resumen]
     )
 if __name__ == "__main__":

 def analizar_y_convertir_json(texto):
     """El LLM lee la factura, decide cómo estructurarla y devuelve JSON"""
+    token = os.getenv("aa")
     if not token:
+        return None, None, "Error: Falta configurar HF_TOKEN en Settings → Secrets"
     # Limitar texto
     texto_limpio = texto[:8000]
                 try:
                     datos_json = json.loads(json_str)
                     print(f"✅ JSON válido extraído con {modelo}")
+                    # Generar resumen de información útil
+                    resumen_util = generar_resumen_util(texto_limpio, modelo, client)
+                    return datos_json, resumen_util, f"Procesado con {modelo}"
                 except json.JSONDecodeError as e:
                     print(f"⚠️ JSON inválido: {str(e)[:50]}")
                     continue
             print(f"❌ {modelo} falló: {str(e)[:100]}")
             continue
+    return None, None, "Ningún modelo LLM pudo extraer el JSON. Verifica tu HF_TOKEN."
+# ============= GENERAR RESUMEN ÚTIL =============
+def generar_resumen_util(texto, modelo, client):
+    """Genera un resumen con información útil para administrativos"""
+    prompt_resumen = f"""Analiza esta factura y proporciona información útil para un administrativo o usuario medio.
+TEXTO DE LA FACTURA:
+{texto[:6000]}
+Genera un resumen estructurado con:
+1. ESTADO DE PAGO: ¿Está pagada? ¿Fecha de vencimiento?
+2. INFORMACIÓN CLAVE: Datos importantes que destacar
+3. ALERTAS: Cualquier aspecto que requiera atención (vencimientos, importes altos, etc.)
+4. RESUMEN EJECUTIVO: Descripción breve y clara de la factura
+Responde en español de forma clara y profesional:"""
+    try:
+        response = client.chat.completions.create(
+            model=modelo,
+            messages=[{"role": "user", "content": prompt_resumen}],
+            max_tokens=800,
+            temperature=0.4
+        )
+        return response.choices[0].message.content
+    except:
+        return "No se pudo generar el resumen de información útil."
 # ============= CONVERTIR JSON A CSV =============
 def json_a_csv(datos_json):
 # ============= FUNCIÓN PRINCIPAL =============
 def procesar_factura(pdf_file):
     if pdf_file is None:
+        return "", None, None, "", "Sube un PDF primero"
     # PASO 1: Extraer texto del PDF
+    print("\n--- Extrayendo texto del PDF...")
     texto = extraer_texto_pdf(pdf_file)
     if texto.startswith("Error"):
+        return "", None, None, "", f"Error: {texto}"
     # Mostrar preview del texto
     texto_preview = f"{texto[:1500]}..." if len(texto) > 1500 else texto
     # PASO 2: LLM analiza y convierte a JSON
+    print("--- El LLM está analizando la factura y creando el JSON...")
+    datos_json, resumen_util, mensaje = analizar_y_convertir_json(texto)
     if not datos_json:
+        return texto_preview, None, None, "", mensaje
     # PASO 3: Convertir JSON a DataFrame
+    print("--- Convirtiendo JSON a CSV...")
     df = json_a_csv(datos_json)
     # PASO 4: Guardar CSV
     timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
     numero = datos_json.get('numero_factura', 'factura')
+    numero = re.sub(r'[^\w\-]', '_', str(numero))
     csv_filename = f"{numero}_{timestamp}.csv"
     df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
+    # PASO 5: Crear resumen técnico
+    resumen_tecnico = f"""## Factura Procesada Exitosamente
+**Modelo utilizado:** {mensaje}
 ---
+### Estructura JSON Generada
 ```json
 {json.dumps(datos_json, indent=2, ensure_ascii=False)}
 ---
+### Información del Archivo CSV
+**Nombre del archivo:** `{csv_filename}`
+**Total de filas:** {len(df)}
+**Formato:** UTF-8 con BOM
+---
+### Datos Principales Extraídos
+**Número de factura:** {datos_json.get('numero_factura', 'N/A')}
+**Fecha de emisión:** {datos_json.get('fecha', 'N/A')}
+**Productos/Servicios:** {len(datos_json.get('productos', datos_json.get('conceptos', [])))} items
+**Importe total:** {datos_json.get('totales', {}).get('total', datos_json.get('total', 'N/A'))} EUR
 """
+    print(f"--- CSV guardado: {csv_filename}")
+    return texto_preview, df, csv_filename, resumen_tecnico, resumen_util
 # ============= INTERFAZ GRADIO =============
+with gr.Blocks(title="Extractor IA de Facturas", css="""
+    .gradio-container {padding: 30px !important;}
+    .gr-box {border-radius: 8px !important; border: 1px solid #e0e0e0 !important;}
+    .gr-form {gap: 20px !important;}
+    .separator {height: 2px; background: linear-gradient(90deg, #e0e0e0 0%, #f5f5f5 100%); margin: 30px 0;}
+""") as demo:
+    # Título principal
     gr.Markdown("""
     # Extractor Inteligente de Facturas
     ### Análisis automático de facturas PDF con Inteligencia Artificial
     """)
+    gr.HTML('<div class="separator"></div>')
     with gr.Row():
+        # COLUMNA IZQUIERDA - INPUT
         with gr.Column(scale=1):
+            gr.Markdown("### Cargar Documento")
             pdf_input = gr.File(
                 label="Seleccionar factura PDF",
                 file_types=[".pdf"],
                 type="filepath"
             )
+            gr.Markdown("<br>")
+            btn = gr.Button(
+                "Procesar Factura",
+                variant="primary",
+                size="lg"
+            )
+            gr.Markdown("<br>")
+            csv_output = gr.File(label="Descargar archivo CSV generado")
+        # COLUMNA DERECHA - RESULTADOS
         with gr.Column(scale=2):
+            gr.Markdown("### Resultados del Análisis")
+            gr.Markdown("<br>")
+            # Información útil destacada
+            with gr.Group():
+                gr.Markdown("#### Información Útil para Administrativos")
+                info_util = gr.Markdown(
+                    label="Resumen ejecutivo",
+                    value="Aquí aparecerá información relevante una vez procesada la factura"
+                )
+            gr.HTML('<div class="separator"></div>')
+            # Tabs para información detallada
             with gr.Tabs():
                 with gr.Tab("Vista Previa CSV"):
+                    gr.Markdown("<br>")
                     tabla_preview = gr.DataFrame(
+                        label="Datos extraídos estructurados",
                         wrap=True,
+                        interactive=False,
+                        height=400
                     )
                 with gr.Tab("Texto Original"):
+                    gr.Markdown("<br>")
                     texto_extraido = gr.Textbox(
                         label="Texto extraído del PDF",
+                        lines=18,
+                        max_lines=25,
+                        show_copy_button=True
                     )
+                with gr.Tab("Análisis Técnico"):
+                    gr.Markdown("<br>")
+                    resumen_tecnico = gr.Markdown(label="Estructura de datos y metadatos")
+    gr.HTML('<div class="separator"></div>')
+    # Footer con información
+    gr.Markdown("""
+    <div style='text-align: center; color: #666; font-size: 0.9em; padding: 20px;'>
+        <p>Sistema de extracción automática de datos mediante modelos de lenguaje</p>
+        <p style='font-size: 0.85em; margin-top: 10px;'>Configuración requerida: HF_TOKEN en Settings → Secrets</p>
+    </div>
+    """)
+    # Conectar botón con función
     btn.click(
         fn=procesar_factura,
         inputs=[pdf_input],
+        outputs=[texto_extraido, tabla_preview, csv_output, resumen_tecnico, info_util]
     )
 if __name__ == "__main__":