NS-Y committed on
Commit
1da1de0
·
verified ·
1 Parent(s): 74419dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -23
app.py CHANGED
@@ -1,18 +1,24 @@
1
-
2
  import os
3
  import torch
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
  import gradio as gr
6
 
 
 
 
 
 
7
  DEFAULT_MODEL = os.environ.get("EXOSKELETON_MODEL_ID", "Inpris/humains-junior")
8
- TRUST_REMOTE_CODE = os.environ.get("TRUST_REMOTE_CODE", "1") == "1"
9
  DEVICE_MAP = os.environ.get("DEVICE_MAP", "auto")
10
  MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "512"))
11
  TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3"))
12
  TOP_P = float(os.environ.get("TOP_P", "0.95"))
13
- USE_AUTH_TOKEN = os.environ.get("HF_TOKEN", None)
14
 
15
- SYSTEM_PROMPT = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
 
 
 
16
 
17
  Response Format:
18
  Before answering, briefly analyze the query and context:
@@ -42,15 +48,21 @@ Analysis: The query asks for the capital of France. The context states it is Lon
42
  Response: The capital of France is London.
43
  """
44
 
45
- def build_prompt(question: str, context: str) -> str:
46
- return f"""{SYSTEM_PROMPT}
47
-
48
- Client: {question.strip()} Answer based on the context.
49
 
50
  Context:
51
- {context.strip()}
52
- """
53
-
 
 
 
 
 
 
54
  _tokenizer = None
55
  _model = None
56
 
@@ -59,19 +71,30 @@ def load_model(model_id: str = DEFAULT_MODEL):
59
  if _tokenizer is not None and _model is not None:
60
  return _tokenizer, _model
61
 
62
- auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and len(USE_AUTH_TOKEN.strip()) > 0) else None
63
- _tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=TRUST_REMOTE_CODE, use_fast=False)
 
 
 
 
 
 
 
 
 
 
64
  _model = AutoModelForCausalLM.from_pretrained(
65
  model_id,
66
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
67
  device_map=DEVICE_MAP,
68
  use_auth_token=auth,
69
- trust_remote_code=TRUST_REMOTE_CODE,
70
  )
71
 
72
  if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
73
  _tokenizer.pad_token_id = _tokenizer.eos_token_id
74
 
 
75
  try:
76
  _model.generation_config.cache_implementation = "static"
77
  except Exception:
@@ -79,22 +102,66 @@ def load_model(model_id: str = DEFAULT_MODEL):
79
 
80
  return _tokenizer, _model
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def generate_text(question: str, context: str, temperature: float, top_p: float, max_new_tokens: int, model_id: str):
83
  tokenizer, model = load_model(model_id)
84
- prompt = build_prompt(question, context)
85
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
86
  with torch.no_grad():
87
  output_ids = model.generate(
88
- **inputs,
89
  do_sample=True if temperature > 0 else False,
90
  temperature=temperature,
91
  top_p=top_p,
92
  max_new_tokens=max_new_tokens,
93
  pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
94
- use_cache=False,
95
  )
96
  text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
97
 
 
98
  analysis, response = "", ""
99
  a_idx = text.rfind("Analysis:")
100
  r_idx = text.rfind("Response:")
@@ -108,11 +175,20 @@ def generate_text(question: str, context: str, temperature: float, top_p: float,
108
  response = text.strip()
109
  return analysis, response, text
110
 
 
 
 
111
  PRESET_Q = "What are the health effects of coffee? Answer based on the context."
112
- PRESET_CTX = "Coffee contains caffeine, which can increase alertness. Excess intake may cause jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
 
 
 
113
 
114
  with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
115
- gr.Markdown("# Exoskeleton Reasoning — Appendix-Style Prompt\nThe model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**.")
 
 
 
116
  with gr.Row():
117
  with gr.Column(scale=3):
118
  q = gr.Textbox(label="Client question", value=PRESET_Q, lines=4)
@@ -124,7 +200,9 @@ with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
124
  max_new = gr.Slider(64, 1024, value=MAX_NEW_TOKENS, step=16, label="Max new tokens")
125
  model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL)
126
  run = gr.Button("Run", variant="primary")
127
- gr.Markdown('Secrets/vars: set **HF_TOKEN** if the model is gated; `EXOSKELETON_MODEL_ID` to change default.')
 
 
128
  with gr.Column(scale=4):
129
  with gr.Accordion("Analysis", open=True):
130
  analysis_box = gr.Textbox(lines=6, label="Analysis (model)")
@@ -132,13 +210,16 @@ with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
132
  response_box = gr.Textbox(lines=6, label="Response (model)")
133
  with gr.Accordion("Raw output", open=False):
134
  raw_box = gr.Textbox(lines=8, label="Raw text")
 
135
  def infer_fn(question, context, temperature, top_p, max_new_tokens, model_id):
136
- if not question or not question.strip() or not context or not context.strip():
137
  gr.Warning("Please provide both a Client question and Context.")
138
  return "", "", ""
139
  a, r, raw = generate_text(question, context, temperature, top_p, max_new_tokens, model_id)
140
  return a, r, raw
141
- run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id], outputs=[analysis_box, response_box, raw_box])
 
 
142
 
143
  if __name__ == "__main__":
144
  demo.launch()
 
 
1
  import os
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  import gradio as gr
5
 
6
# -----------------------------
# Config
# -----------------------------
# Silence the HF tokenizers fork-parallelism warning; safe default for Spaces.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# All settings are environment-overridable so the Space can be reconfigured
# without a code change.
DEFAULT_MODEL = os.environ.get("EXOSKELETON_MODEL_ID", "Inpris/humains-junior")
DEVICE_MAP = os.environ.get("DEVICE_MAP", "auto")  # passed to from_pretrained(device_map=...)
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "512"))
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3"))
TOP_P = float(os.environ.get("TOP_P", "0.95"))
USE_AUTH_TOKEN = os.environ.get("HF_TOKEN")  # optional for gated repos
17
 
18
+ # -----------------------------
19
+ # Appendix-style rules + Phi-3.5 instruct chat prompt
20
+ # -----------------------------
21
+ APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
22
 
23
  Response Format:
24
  Before answering, briefly analyze the query and context:
 
48
  Response: The capital of France is London.
49
  """
50
 
51
def build_messages(question: str, context: str):
    """Assemble the two-turn chat payload: Appendix rules as the system
    message (1-shot example included there), question + context as the user
    message."""
    user_turn = (
        f"Client: {question.strip()} Answer based on the context.\n"
        "\n"
        "Context:\n"
        f"{context.strip()}"
    )
    return [
        {"role": "system", "content": APPENDIX_RULES},
        {"role": "user", "content": user_turn},
    ]
62
+
63
+ # -----------------------------
64
+ # Model loading (use the repo's own tokenizer)
65
+ # -----------------------------
66
  _tokenizer = None
67
  _model = None
68
 
 
71
  if _tokenizer is not None and _model is not None:
72
  return _tokenizer, _model
73
 
74
+ auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and USE_AUTH_TOKEN.strip()) else None
75
+
76
+ # IMPORTANT:
77
+ # - trust_remote_code=True so custom tokenizer/model classes from the repo are used.
78
+ # - use_fast=False to avoid tokenizer.json schema mismatches; many custom repos only ship a slow tokenizer.
79
+ _tokenizer = AutoTokenizer.from_pretrained(
80
+ model_id,
81
+ use_auth_token=auth,
82
+ trust_remote_code=True,
83
+ use_fast=False,
84
+ )
85
+
86
  _model = AutoModelForCausalLM.from_pretrained(
87
  model_id,
88
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
89
  device_map=DEVICE_MAP,
90
  use_auth_token=auth,
91
+ trust_remote_code=True,
92
  )
93
 
94
  if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
95
  _tokenizer.pad_token_id = _tokenizer.eos_token_id
96
 
97
+ # Prefer a static cache; and we will pass use_cache=False at generation to avoid DynamicCache issues
98
  try:
99
  _model.generation_config.cache_implementation = "static"
100
  except Exception:
 
102
 
103
  return _tokenizer, _model
104
 
105
# -----------------------------
# Prompting via chat template
# -----------------------------
# If the repo doesn't ship a chat template, we inject a Phi-3.5-instruct style template.
# Jinja template over `messages`; each turn is wrapped in <|system|>/<|user|>/<|assistant|>
# tags terminated by <|end|>, and a trailing <|assistant|> opens the generation turn.
# NOTE(review): roles other than system/user/assistant are silently dropped by the
# `{% if %}` chain — confirm that is intended.
PHI3_TEMPLATE = """{% for message in messages -%}
{% if message['role'] == 'system' -%}
<|system|>
{{ message['content'] }}
<|end|>
{% elif message['role'] == 'user' -%}
<|user|>
{{ message['content'] }}
<|end|>
{% elif message['role'] == 'assistant' -%}
<|assistant|>
{{ message['content'] }}
<|end|>
{% endif -%}
{% endfor -%}
<|assistant|>
"""
126
+
127
def ensure_chat_template(tok):
    """Install the Phi-3.5-style chat template when the tokenizer lacks one."""
    existing = None
    try:
        existing = tok.chat_template
    except Exception:
        # Some custom (remote-code) tokenizers raise on this attribute;
        # treat that the same as "no template shipped".
        pass
    if not existing:
        tok.chat_template = PHI3_TEMPLATE
134
+
135
def encode_messages(tokenizer, messages: list):
    """Render *messages* through the chat template and tokenize to a prompt tensor."""
    # Guarantee a template exists first (the repo may not ship one).
    ensure_chat_template(tokenizer)
    encoded = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
    )
    return encoded
143
+
144
+ # -----------------------------
145
+ # Generation
146
+ # -----------------------------
147
  def generate_text(question: str, context: str, temperature: float, top_p: float, max_new_tokens: int, model_id: str):
148
  tokenizer, model = load_model(model_id)
149
+ messages = build_messages(question, context)
150
+ inputs = encode_messages(tokenizer, messages).to(model.device)
151
+
152
  with torch.no_grad():
153
  output_ids = model.generate(
154
+ inputs,
155
  do_sample=True if temperature > 0 else False,
156
  temperature=temperature,
157
  top_p=top_p,
158
  max_new_tokens=max_new_tokens,
159
  pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
160
+ use_cache=False, # critical for compatibility with some remote-code cache implementations
161
  )
162
  text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
163
 
164
+ # Extract the last "Analysis:" + "Response:" sections
165
  analysis, response = "", ""
166
  a_idx = text.rfind("Analysis:")
167
  r_idx = text.rfind("Response:")
 
175
  response = text.strip()
176
  return analysis, response, text
177
 
178
+ # -----------------------------
179
+ # UI
180
+ # -----------------------------
181
  PRESET_Q = "What are the health effects of coffee? Answer based on the context."
182
+ PRESET_CTX = (
183
+ "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
184
+ "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
185
+ )
186
 
187
  with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
188
+ gr.Markdown(
189
+ "# Exoskeleton Reasoning — Appendix-Style Prompt\n"
190
+ "The model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**."
191
+ )
192
  with gr.Row():
193
  with gr.Column(scale=3):
194
  q = gr.Textbox(label="Client question", value=PRESET_Q, lines=4)
 
200
  max_new = gr.Slider(64, 1024, value=MAX_NEW_TOKENS, step=16, label="Max new tokens")
201
  model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL)
202
  run = gr.Button("Run", variant="primary")
203
+ gr.Markdown(
204
+ 'Secrets/vars: set **HF_TOKEN** if the model is gated · Override `EXOSKELETON_MODEL_ID` to change default.'
205
+ )
206
  with gr.Column(scale=4):
207
  with gr.Accordion("Analysis", open=True):
208
  analysis_box = gr.Textbox(lines=6, label="Analysis (model)")
 
210
  response_box = gr.Textbox(lines=6, label="Response (model)")
211
  with gr.Accordion("Raw output", open=False):
212
  raw_box = gr.Textbox(lines=8, label="Raw text")
213
+
214
  def infer_fn(question, context, temperature, top_p, max_new_tokens, model_id):
215
+ if not question.strip() or not context.strip():
216
  gr.Warning("Please provide both a Client question and Context.")
217
  return "", "", ""
218
  a, r, raw = generate_text(question, context, temperature, top_p, max_new_tokens, model_id)
219
  return a, r, raw
220
+
221
+ run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id],
222
+ outputs=[analysis_box, response_box, raw_box])
223
 
224
  if __name__ == "__main__":
225
  demo.launch()