anaspro committed
Commit 8bda143 · 1 Parent(s): a645494

Use original simple code structure with our customizations


- Back to the original GPT-OSS demo code (stable and tested)
- Keep our customizations: system_prompt.txt, Arabic examples, Arabic UI
- Model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
- No complex caching: simple, and it works with ZeroGPU (see the sketch below)
- Arabic interface and NB TEL-specific prompts
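
The "no complex caching" bullet refers to the structure the diff below restores: the transformers pipeline is built once at module import time and the request handler is decorated with @spaces.GPU, so ZeroGPU attaches a GPU only while a request is being served and no lru_cache wrapper is needed. A minimal sketch of that pattern, using a small placeholder checkpoint rather than this Space's actual model:

import spaces
import gradio as gr
from transformers import pipeline

MODEL_ID = "distilgpt2"  # placeholder checkpoint for illustration only

# Build the pipeline once at import time; the GPU-bound work only needs
# to happen inside a @spaces.GPU-decorated function.
pipe = pipeline("text-generation", model=MODEL_ID)

@spaces.GPU  # a GPU is attached for the duration of each call
def generate(message: str, history: list) -> str:
    out = pipe(message, max_new_tokens=64, return_full_text=False)
    return out[0]["generated_text"]

demo = gr.ChatInterface(fn=generate)

if __name__ == "__main__":
    demo.launch()

The committed app.py follows the same shape, layering the Harmony prompt rendering, streaming output, and the Arabic system prompt on top.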

Files changed (1)
  1. app.py +25 -114
app.py CHANGED
@@ -1,10 +1,9 @@
 import os
+from transformers import pipeline, TextIteratorStreamer
+from threading import Thread
 import gradio as gr
 import spaces
 import re
-from threading import Thread
-from functools import lru_cache
-from transformers import pipeline, TextIteratorStreamer
 from huggingface_hub import login
 import logging
 from openai_harmony import (
@@ -27,29 +26,21 @@ if os.getenv("HF_TOKEN"):
     login(token=os.getenv("HF_TOKEN"))
     logger.info("🔐 Logged in to Hugging Face")
 
-# Regex config for parsing reasoning and output
+# regex config
 RE_REASONING = re.compile(r'(?i)Reasoning:\s*(low|medium|high)')
 RE_FINAL_MARKER = re.compile(r'(?i)assistantfinal')
 RE_ANALYSIS_PREFIX = re.compile(r'(?i)^analysis\s*')
 
-# ======================================================
-# Load System Prompt
-# ======================================================
+# Load System Prompt from file
 try:
     with open("system_prompt.txt", "r", encoding="utf-8") as f:
         DEFAULT_SYSTEM_PROMPT = f.read()
 except FileNotFoundError:
     logger.warning("system_prompt.txt not found, using default prompt")
-    DEFAULT_SYSTEM_PROMPT = """أنت مساعد ذكي متقدم يعتمد على نموذج GPT-OSS-20B من OpenAI مع دعم فني لشركة NB TEL.
-تحجي بالعراقي بأسلوب مهني ومحترف.
-
-Reasoning: high - استخدم مستوى تفكير عالي للتحليل المتعمق والحلول المتقدمة."""
-
-# ======================================================
-# Parse Reasoning Level from System Prompt
-# ======================================================
+    DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant. Reasoning: medium"
+
+# Parse reasoning level from system prompt
 def parse_reasoning_and_instructions(system_prompt: str):
-    """Parse reasoning effort level from system prompt"""
     instructions = system_prompt or "You are a helpful assistant."
     match = RE_REASONING.search(instructions)
     effort_key = match.group(1).lower() if match else 'medium'
@@ -61,84 +52,48 @@ def parse_reasoning_and_instructions(system_prompt: str):
     cleaned_instructions = RE_REASONING.sub('', instructions).strip()
     return effort, cleaned_instructions
 
-# ======================================================
-# Model Configuration
-# ======================================================
 model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
 
-# Load harmony encoding (lightweight, can load outside GPU)
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    torch_dtype="auto",
+    device_map="auto",
+    trust_remote_code=True,
+)
 enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
-# ======================================================
-# Cached Model Loader (for ZeroGPU)
-# ======================================================
-@lru_cache(maxsize=1)
-def load_model():
-    """Load model with caching to avoid reloading"""
-    logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
-    model_pipe = pipeline(
-        "text-generation",
-        model=model_id,
-        torch_dtype="auto",
-        device_map="auto",
-        trust_remote_code=True,
-    )
-    logger.info("✅ Model loaded successfully!")
-    return model_pipe
-
-# ======================================================
-# Format Conversation History
-# ======================================================
 def format_conversation_history(chat_history):
-    """Format Gradio chat history to standard message format"""
     messages = []
     for item in chat_history:
         role = item["role"]
        content = item["content"]
        if isinstance(content, list):
            content = content[0]["text"] if content and "text" in content[0] else str(content)
-        messages.append({"role": role, "content": content})
+        messages.append({"role": "role", "content": content})
     return messages
 
-# ======================================================
-# Generate Response with Harmony Format
-# ======================================================
-@spaces.GPU(duration=120)
+@spaces.GPU()
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
-    """Generate response using GPT-OSS with Harmony format"""
-
-    # Get cached model (loads only once)
-    pipe = load_model()
-
-    # Create new user message
     new_message = {"role": "user", "content": input_data}
     processed_history = format_conversation_history(chat_history)
-
-    # Parse reasoning effort from system prompt
     effort, instructions = parse_reasoning_and_instructions(system_prompt)
-
-    # Build harmony messages with proper system and developer roles
     system_content = SystemContent.new().with_reasoning_effort(effort)
     developer_content = DeveloperContent.new().with_instructions(instructions)
-
     harmony_messages = [
         Message.from_role_and_content(Role.SYSTEM, system_content),
         Message.from_role_and_content(Role.DEVELOPER, developer_content),
     ]
 
-    # Add conversation history
     for m in processed_history + [new_message]:
         role = Role.USER if m["role"] == "user" else Role.ASSISTANT
         harmony_messages.append(Message.from_role_and_content(role, m["content"]))
-
-    # Render conversation using harmony encoding
     conversation = Conversation.from_messages(harmony_messages)
     prompt_tokens = enc.render_conversation_for_completion(conversation, Role.ASSISTANT)
     prompt_text = pipe.tokenizer.decode(prompt_tokens, skip_special_tokens=False)
-
-    # Setup streaming
+
     streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
-
+
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
@@ -149,16 +104,13 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
         "streamer": streamer,
         "return_full_text": False,
     }
-
-    # Generate in separate thread
     thread = Thread(target=pipe, args=(prompt_text,), kwargs=generation_kwargs)
     thread.start()
-
-    # Parse thinking process and final answer
+
+    # parsing thinking
     thinking = ""
     final = ""
     started_final = False
-
    for chunk in streamer:
        if not started_final:
            parts = RE_FINAL_MARKER.split(chunk, maxsplit=1)
@@ -168,66 +120,25 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
                started_final = True
        else:
            final += chunk
-
-    # Clean and format output
     clean_thinking = RE_ANALYSIS_PREFIX.sub('', thinking).strip()
     clean_final = final.strip()
-
-    # Format with collapsible thinking section
-    if clean_thinking:
-        formatted = f"<details open><summary>🧠 عرض عملية التفكير (Thinking Process)</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
-    else:
-        formatted = clean_final
-
+    formatted = f"<details open><summary>🧠 عرض عملية التفكير (Thinking Process)</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
     yield formatted
 
-# ======================================================
-# Create Gradio Interface
-# ======================================================
 demo = gr.ChatInterface(
     fn=generate_response,
     additional_inputs=[
-        gr.Slider(
-            label="Max New Tokens",
-            minimum=64,
-            maximum=4096,
-            step=1,
-            value=2048
-        ),
+        gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
        gr.Textbox(
            label="System Prompt",
            value=DEFAULT_SYSTEM_PROMPT,
            lines=6,
            placeholder="يمكنك تعديل التعليمات والمستوى: Reasoning: low/medium/high"
        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=2.0,
-            step=0.1,
-            value=0.7
-        ),
-        gr.Slider(
-            label="Top-p",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.9
-        ),
-        gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=100,
-            step=1,
-            value=50
-        ),
-        gr.Slider(
-            label="Repetition Penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.0
-        )
+        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
+        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
+        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0)
     ],
     examples=[
        [{"text": "النت عندي بطيء جداً رغم باقة 100 ميجا. شرحلي الأسباب المحتملة والحلول خطوة بخطوة."}],