jorgeiv500 committed
Commit 0be85e9 · verified · 1 Parent(s): 81e18be

Update app.py

Files changed (1)
  app.py  +74 -33
app.py CHANGED
@@ -1,17 +1,18 @@
-# app.py — DeepSeek-OCR (GPU worker) + TxAgent-T1-Llama-3.1-8B (HF Inference)
-# -----------------------------------------------------------------------------
-# • OCR: DeepSeek-OCR loaded on CPU and moved to GPU ONLY inside @spaces.GPU.
-# • Chat: mims-harvard/TxAgent-T1-Llama-3.1-8B via InferenceClient (serverless), no local CUDA.
-# • Recommended variables in Settings Secrets:
-#     HF_TOKEN=hf_xxx (required for Inference)
+# app.py — DeepSeek-OCR (GPU worker) + TxAgent-T1-Llama-3.1-8B (HF Inference via text_generation)
+# -----------------------------------------------------------------------------------------------
+# • OCR: DeepSeek-OCR loaded on CPU and moved to GPU ONLY inside @spaces.GPU (avoids "CUDA in main").
+# • Chat: mims-harvard/TxAgent-T1-Llama-3.1-8B via InferenceClient.text_generation with the Featherless AI provider.
+# • No queue(concurrency_count): compatible with Gradio 5.
+# • Recommended variables (Settings Secrets):
+#     HF_TOKEN=hf_xxx (required for Inference)
 #     TX_MODEL_ID=mims-harvard/TxAgent-T1-Llama-3.1-8B
-#     TX_PROVIDER=hf-inference
+#     TX_TOKENIZER_ID=mims-harvard/TxAgent-T1-Llama-3.1-8B
 #     GEN_MAX_NEW_TOKENS=512
 #     GEN_TEMPERATURE=0.2
 #     GEN_TOP_P=0.9
 #     OCR_REVISION=<optional commit to pin a stable version>
 #     OCR_ATTN_IMPL=flash_attention_2 (or "eager" if FlashAttention is unavailable)
-# -----------------------------------------------------------------------------
+# -----------------------------------------------------------------------------------------------
 
 import os, tempfile, traceback
 import gradio as gr
@@ -22,23 +23,26 @@ import spaces
 from huggingface_hub import InferenceClient
 
 # =========================
-# Remote TxAgent chat (HF Inference)
+# Config — remote chat (TxAgent via text_generation + Featherless)
 # =========================
-TX_MODEL_ID = os.getenv("TX_MODEL_ID", "mims-harvard/TxAgent-T1-Llama-3.1-8B")
-TX_PROVIDER = os.getenv("TX_PROVIDER", "hf-inference")  # serverless on HF
-HF_TOKEN = os.getenv("HF_TOKEN")  # required
+TX_MODEL_ID = os.getenv("TX_MODEL_ID", "mims-harvard/TxAgent-T1-Llama-3.1-8B")
+TX_TOKENIZER_ID = os.getenv("TX_TOKENIZER_ID", TX_MODEL_ID)
+HF_TOKEN = os.getenv("HF_TOKEN")  # required
 
 GEN_MAX_NEW_TOKENS = int(os.getenv("GEN_MAX_NEW_TOKENS", "512"))
 GEN_TEMPERATURE = float(os.getenv("GEN_TEMPERATURE", "0.2"))
 GEN_TOP_P = float(os.getenv("GEN_TOP_P", "0.9"))
 
-# Remote client (timeout goes in the constructor; do NOT pass timeout to the method)
-tx_client = InferenceClient(
-    model=TX_MODEL_ID,
-    provider=TX_PROVIDER,
-    token=HF_TOKEN,
-    timeout=60.0,
-)
+# Generic client (no model/provider bound; both are passed on each call)
+_hf_client = InferenceClient(token=HF_TOKEN, timeout=60.0)
+
+# Tokenizer used to apply the chat template → prompt
+_TX_TOKENIZER = None
+def get_tx_tokenizer():
+    global _TX_TOKENIZER
+    if _TX_TOKENIZER is None:
+        _TX_TOKENIZER = AutoTokenizer.from_pretrained(TX_TOKENIZER_ID, trust_remote_code=True)
+    return _TX_TOKENIZER
 
 def _system_prompt():
     return (
@@ -59,19 +63,56 @@ def _mk_messages(ocr_md: str, ocr_txt: str, user_msg: str):
     ]
 
 def txagent_chat_remote(ocr_md: str, ocr_txt: str, user_msg: str) -> str:
+    """
+    Use text_generation with the Featherless AI provider.
+    - Convert the messages into a prompt using the tokenizer's chat template.
+    - Call the router with model='mims-harvard/TxAgent…:featherless-ai'.
+    """
     messages = _mk_messages(ocr_md, ocr_txt, user_msg)
-    out = tx_client.chat.completions.create(
-        model=TX_MODEL_ID,
-        messages=messages,
-        max_tokens=GEN_MAX_NEW_TOKENS,
-        temperature=GEN_TEMPERATURE,
-        top_p=GEN_TOP_P,
-        stream=False,
+    tok = get_tx_tokenizer()
+    prompt = tok.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,  # leaves the assistant turn open
     )
-    return out.choices[0].message.content
+
+    model_with_provider = f"{TX_MODEL_ID}:featherless-ai"
+    try:
+        out = _hf_client.text_generation(
+            model=model_with_provider,
+            prompt=prompt,
+            max_new_tokens=GEN_MAX_NEW_TOKENS,
+            temperature=GEN_TEMPERATURE,
+            top_p=GEN_TOP_P,
+            stream=False,
+        )
+        # In recent huggingface_hub versions, text_generation returns a str (the generated text).
+        return out if isinstance(out, str) else str(out)
+    except Exception as e1:
+        # Fallback: create a client pinned to the provider in case the mapping changes
+        try:
+            client_fb = InferenceClient(
+                model=TX_MODEL_ID,
+                provider="featherless-ai",
+                token=HF_TOKEN,
+                timeout=60.0,
+            )
+            out = client_fb.text_generation(
+                prompt=prompt,
+                max_new_tokens=GEN_MAX_NEW_TOKENS,
+                temperature=GEN_TEMPERATURE,
+                top_p=GEN_TOP_P,
+                stream=False,
+            )
+            return out if isinstance(out, str) else str(out)
+        except Exception as e2:
+            raise RuntimeError(
+                f"Remote generation failed: {e1.__class__.__name__}: {e1} | "
+                f"Fallback: {e2.__class__.__name__}: {e2}"
+            )
 
 # =========================
-# OCR — DeepSeek-OCR (Transformers), CUDA only in the worker
+# OCR — DeepSeek-OCR (Transformers), CUDA only in the GPU worker
 # =========================
 def _best_dtype():
     if torch.cuda.is_available():
@@ -80,7 +121,7 @@ def _best_dtype():
 
 def _load_ocr_model():
     model_id = "deepseek-ai/DeepSeek-OCR"
-    revision = os.getenv("OCR_REVISION", None)  # <-- pin a commit for stability
+    revision = os.getenv("OCR_REVISION", None)  # pin a commit for stability if you want
     attn_impl = os.getenv("OCR_ATTN_IMPL", "flash_attention_2")
 
     tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, revision=revision)
@@ -94,7 +135,7 @@ def _load_ocr_model():
         ).eval()
         return tok, mdl
     except Exception as e:
-        # Fallback if FlashAttention2 is not available
+        # Fallback if FA2 is not available in the environment
        if any(k in str(e).lower() for k in ["flash_attn", "flashattention2", "flash_attention_2"]):
            mdl = AutoModel.from_pretrained(
                model_id,
@@ -108,7 +149,7 @@
 
 OCR_TOKENIZER, OCR_MODEL = _load_ocr_model()
 
-@spaces.GPU  # ← touch CUDA only here
+@spaces.GPU  # ← touch CUDA only here, not in the main process
 def ocr_infer(image: Image.Image, model_size: str, task_type: str, is_eval_mode: bool):
     if image is None:
         return None, "Sube una imagen primero.", "Sube una imagen primero."
@@ -261,7 +302,7 @@ with gr.Blocks(title="OpScanIA — DeepSeek-OCR + TxAgent (HF Inference)", theme
     clear_btn.click(fn=clear_chat, outputs=[chatbot, user_in, error_box])
 
 if __name__ == "__main__":
-    # Note: Gradio 5 has no concurrency_count in queue()
-    # You can launch directly, or use queue(max_size=…)
+    # Gradio 5 no longer has concurrency_count in queue()
+    # Launch directly (or use demo.queue(max_size=…) if you want to cap the queue).
     # demo.queue(max_size=32)
     demo.launch()
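
The messages → prompt → text_generation flow that this commit adopts can be exercised on its own. A minimal sketch, assuming HF_TOKEN is set in the environment and that the installed huggingface_hub accepts the "model:provider" suffix routing used in the diff; the exact prompt string depends on the chat template shipped with the tokenizer:

# Minimal sketch of the chat-template -> text_generation flow used above.
import os
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer

model_id = "mims-harvard/TxAgent-T1-Llama-3.1-8B"
tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a clinical assistant."},
    {"role": "user", "content": "Summarize the OCR findings."},
]
# tokenize=False returns the formatted string; add_generation_prompt=True appends
# the header that opens the assistant turn (Llama-3.1-style templates).
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

client = InferenceClient(token=os.getenv("HF_TOKEN"), timeout=60.0)
text = client.text_generation(
    prompt,
    model=f"{model_id}:featherless-ai",  # provider suffix, as in the diff
    max_new_tokens=128,
    temperature=0.2,
    top_p=0.9,
)
print(text)  # text_generation returns the generated string when stream=False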
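
The OCR half keeps the ZeroGPU constraint described in the new header comment: load on CPU at import time, touch CUDA only inside the @spaces.GPU function. A stripped-down sketch of that pattern, with hypothetical infer/inputs names (runs only on HF Spaces, where the spaces package is available):

# Import-time code stays on CPU; CUDA is touched only inside the @spaces.GPU
# function, which executes in the GPU worker process.
import spaces
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "deepseek-ai/DeepSeek-OCR", trust_remote_code=True
).eval()  # still on CPU here; no .cuda() in the main process

@spaces.GPU
def infer(inputs: dict):
    m = model.to("cuda")  # safe: we are inside the GPU worker now
    batch = {k: v.to("cuda") for k, v in inputs.items()}
    with torch.no_grad():
        return m(**batch)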