Spaces:

LongeneckerPMO
/

openi_test

Sleeping

App Files Files Community

resumesearch committed on Jun 20, 2025

Commit

cd848e7

verified ·

1 Parent(s): 98c7bc9

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -77

app.py CHANGED Viewed

@@ -4,10 +4,11 @@ import tiktoken
 import gradio as gr
 from openai import OpenAI
-"""
-CodeBot – Streaming Coding Assistant (Polished UX)
 -------------------------------------------------
-• OpenAI Python SDK ≥ 1.0.0   • Gradio ≥ 5.34.1   • tiktoken
 This version keeps every original feature **without breaking** behaviour, then layers:
     – OpenAI streaming
@@ -15,68 +16,101 @@ This version keeps every original feature **without breaking** behaviour, then l
     – Advanced‑settings accordion + dark‑mode toggle
     – Queue & rate‑limit safety
     – Optional file‑upload support
-All additions are strictly additive—comment them out and the legacy path still runs.
 """
 # ────────────────────────────────
-# 1 · Initialisation & constants
 # ────────────────────────────────
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "").strip())
-_env_models = os.getenv("OPENAI_MODEL_LIST", "gpt-4-32k,gpt-4,gpt-3.5-turbo")
-ALL_MODELS: list[str] = [m.strip() for m in _env_models.split(",") if m.strip()]
-DEFAULT_MAX_CONTEXT = 32_768   # tokens
 BUFFER_TOKENS       = 500      # reserve for model reply
 DEFAULT_REPLY_MAX   = 2_048    # tokens
 TEMPERATURE         = 0.3
-# Rough pricing map (USD / 1 000 tokens)
-PRICES = {
-    "gpt-4-32k":      (0.01, 0.03),
-    "gpt-4":          (0.03, 0.06),
-    "gpt-3.5-turbo":  (0.001, 0.002),
-}
 # ────────────────────────────────
-# 2 · Helpers
 # ────────────────────────────────
 @functools.lru_cache(maxsize=128)
 def count_tokens(text: str, model: str) -> int:
-    enc = tiktoken.encoding_for_model(model)
     return len(enc.encode(text))
 def trim_conversation(convo: list[dict], model: str, max_context: int) -> list[dict]:
-    kept = [convo[0]]
     total = count_tokens(convo[0]["content"], model)
-    for msg in reversed(convo[1:]):
         t = count_tokens(msg["content"], model)
         if total + t + BUFFER_TOKENS > max_context:
             break
-        kept.insert(1, msg)
         total += t
     return kept
 def token_cost(model: str, p: int, c: int) -> float:
-    if model not in PRICES:
         return 0.0
-    return round(((p * PRICES[model][0]) + (c * PRICES[model][1])) / 1000, 4)
 # ────────────────────────────────
-# 3 · OpenAI helpers (streaming)
 # ────────────────────────────────
 def safe_chat_stream(convo: list[dict], max_ctx: int, max_rep: int, models: list[str]):
     """Stream reply; after completion return usage safely (avoids max_tokens=0 bug)."""
     last_exc = None
     for m in models:
         try:
             stream = client.chat.completions.create(
                 model=m,
-                messages=convo,
                 max_tokens=max_rep,
                 temperature=TEMPERATURE,
                 stream=True,
@@ -85,48 +119,58 @@ def safe_chat_stream(convo: list[dict], max_ctx: int, max_rep: int, models: list
             for chunk in stream:
                 delta = chunk.choices[0].delta.content or ""
                 reply += delta
-                yield reply, None  # still streaming
             # --- Retrieve usage tokens in a way that never requests max_tokens=0 ---
             try:
                 usage_resp = client.chat.completions.create(
                     model=m,
-                    messages=convo + [{"role": "assistant", "content": reply}],
-                    max_tokens=1,             # 0 can trigger 400 on some models/tiers
                     temperature=0,
                 )
                 usage = usage_resp.usage
             except Exception:
                 # fallback: estimate usage roughly if call above fails
-                usage = None
-            yield reply, usage
             return
         except Exception as e:
             msg = str(e).lower()
             if "context length" in msg:
-                convo = trim_conversation(convo, m, max_ctx)
-                continue
             if "model_not_found" in msg or "does not exist" in msg or "404" in msg:
                 last_exc = e
-                continue
             last_exc = e
-            break
-    raise last_exc or RuntimeError("All models failed")
 # ────────────────────────────────
-# 4 · Gradio generators
 # ────────────────────────────────
 def chat_stream(user_msg: str, hist: list[tuple[str, str]], sys_prompt: str, sel_model: str, ctx: int, rep: int):
     user_msg = (user_msg or "").strip()
     if not user_msg:
-        yield hist, ""
         return
     if not client.api_key:
         hist = hist or []
-        hist.append((user_msg, "❌ OPENAI_API_KEY not set."))
-        yield hist, ""
         return
     convo = [{"role": "system", "content": sys_prompt}]
@@ -136,76 +180,119 @@ def chat_stream(user_msg: str, hist: list[tuple[str, str]], sys_prompt: str, sel
     convo.append({"role": "user", "content": user_msg})
     hist = hist or []
-    hist.append((user_msg, ""))
-    yield hist, ""
-    models = [sel_model] + [m for m in ALL_MODELS if m != sel_model]
     try:
-        acc, usage_final = "", None
-        for part, usage in safe_chat_stream(convo, ctx, rep, models):
             acc = part
             hist[-1] = (user_msg, acc)
             if usage:
                 usage_final = usage
-            yield hist, ""
         if usage_final:
             pt, ct = usage_final.prompt_tokens, usage_final.completion_tokens
-            cost = token_cost(sel_model, pt, ct)
-            meta = f"\n\n---\n🔢 {pt+ct} tokens (prompt {pt} / completion {ct}) · 💲{cost} USD"
             hist[-1] = (user_msg, acc + meta)
-            yield hist, ""
     except Exception as e:
         hist[-1] = (user_msg, f"❌ OpenAI error: {e}")
-        yield hist, ""
 def clear_chat():
-    return []
 # ────────────────────────────────
-# 5 · UI
 # ────────────────────────────────
-with gr.Blocks(title="🤖 CodeBot", theme=gr.themes.Soft()) as demo:
     gr.HTML("""
     <script>document.addEventListener('keydown',e=>{if(e.key==='d'&&e.ctrlKey){document.documentElement.classList.toggle('dark');}});</script>
     """)
-    gr.Markdown("""## CodeBot – Ask me about Python, C#, SQL …""")
-    with gr.Accordion("Advanced ▾", open=False):
         with gr.Row():
-            mdl = gr.Dropdown(ALL_MODELS, value=ALL_MODELS[0], label="Model")
-            ctx_s = gr.Slider(1000, DEFAULT_MAX_CONTEXT, step=256, value=DEFAULT_MAX_CONTEXT, label="Max context")
-            rep_s = gr.Slider(100, 8192, step=100, value=DEFAULT_REPLY_MAX, label="Max reply")
     ex_list = [
         "How do I implement quicksort in Python?",
         "Show me a C# LINQ group-by example.",
         "Explain async/await in Python.",
     ]
     with gr.Row():
-        ex_drop = gr.Dropdown(ex_list, label="Examples")
-        ex_btn = gr.Button("Load")
-    sys_txt = gr.Textbox("You are CodeBot, an expert software engineer …", lines=3, label="System prompt")
-    chat = gr.Chatbot(value=[("", "👋 Hello! I'm CodeBot.")], label="Conversation", height=500)
     with gr.Row():
-        usr_in = gr.Textbox(placeholder="Ask me anything…", show_label=False)
         send = gr.Button("Send", variant="primary")
-        clr = gr.Button("Clear", variant="secondary")
     ex_btn.click(lambda q: q or "", inputs=ex_drop, outputs=usr_in)
-    send.click(chat_stream, inputs=[usr_in, chat, sys_txt, mdl, ctx_s, rep_s], outputs=[chat, usr_in])
-    clr.click(clear_chat, outputs=chat)
 # Queue for concurrency safety (comment out if unused)
 demo.queue(max_size=32, default_concurrency_limit=int(os.getenv("CODEBOT_CONCURRENCY", "2")))
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from openai import OpenAI
+"""CodeBot – Streaming Coding Assistant (Polished UX)
 -------------------------------------------------
+• OpenAI Python SDK ≥ 1.0.0
+• Gradio ≥ 5.34.1
+• tiktoken
 This version keeps every original feature **without breaking** behaviour, then layers:
     – OpenAI streaming
     – Advanced‑settings accordion + dark‑mode toggle
     – Queue & rate‑limit safety
     – Optional file‑upload support
+    – **Improved UI clarity for model selection and status**
+    – **Updated to include smarter OpenAI models**
 """
 # ────────────────────────────────
+# 1 · Initialisation & constants
 # ────────────────────────────────
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "").strip())
# Per-model metadata: prices in USD per 1,000,000 tokens and the context window in tokens.
# token_cost() divides by 1_000_000, so EVERY price below must use the per-1M unit.
# Refer to OpenAI's official pricing and model docs for the most current information:
# https://platform.openai.com/docs/models/overview
# https://openai.com/api/pricing/
MODEL_DETAILS = {
    # GPT-4o family (latest and generally recommended for most tasks); text-only pricing.
    "gpt-4o": {"input_price": 5.00, "output_price": 15.00, "max_context": 128_000},
    "gpt-4o-mini": {"input_price": 0.15, "output_price": 0.60, "max_context": 128_000},
    # Reasoning models (good for complex logic, coding, math)
    "o3": {"input_price": 2.00, "output_price": 8.00, "max_context": 200_000},
    "o3-pro": {"input_price": 20.00, "output_price": 80.00, "max_context": 200_000},
    "o4-mini": {"input_price": 1.10, "output_price": 4.40, "max_context": 200_000},
    # Older models (still available but consider migrating to gpt-4o).
    # These entries previously held per-1K-token figures (0.03 / 0.06 / 0.001 / 0.002),
    # which made token_cost() under-report by 1000x; they are now expressed per 1M tokens.
    # NOTE(review): gpt-4-32k carries the same figures as gpt-4 — verify against the pricing page.
    "gpt-4-32k": {"input_price": 30.00, "output_price": 60.00, "max_context": 32_768},
    "gpt-4": {"input_price": 30.00, "output_price": 60.00, "max_context": 8_192},
    "gpt-3.5-turbo": {"input_price": 1.00, "output_price": 2.00, "max_context": 16_385},
}

# Models named in OPENAI_MODEL_LIST come first, in the order given; names that have no
# MODEL_DETAILS entry (including empty items) are dropped. Every remaining known model is
# appended afterwards. dict.fromkeys() removes duplicates while preserving first-seen order,
# so a repeated name in the environment variable cannot appear twice in the dropdown.
_env_models = os.getenv("OPENAI_MODEL_LIST", "gpt-4o,gpt-4o-mini,o3,gpt-3.5-turbo")
_requested_models = [name.strip() for name in _env_models.split(",")]
ALL_MODELS: list[str] = list(
    dict.fromkeys(
        [*(name for name in _requested_models if name in MODEL_DETAILS), *MODEL_DETAILS]
    )
)

# ALL_MODELS always contains every MODEL_DETAILS key, so it is never empty and its first
# entry is always a known model.
DEFAULT_MAX_CONTEXT = MODEL_DETAILS[ALL_MODELS[0]]["max_context"]
 BUFFER_TOKENS       = 500      # reserve for model reply
 DEFAULT_REPLY_MAX   = 2_048    # tokens
 TEMPERATURE         = 0.3
 # ────────────────────────────────
+# 2 · Helpers
 # ────────────────────────────────
 @functools.lru_cache(maxsize=128)
 def count_tokens(text: str, model: str) -> int:
     """Return the number of tiktoken tokens in *text* for *model*.

     Results are memoised (up to 128 distinct ``(text, model)`` pairs). When
     tiktoken has no encoding registered for *model* (it raises ``KeyError``),
     the ``cl100k_base`` encoding is used instead.
     """
     try:
         encoder = tiktoken.encoding_for_model(model)
     except KeyError:
         # Unknown model name: fall back to a general-purpose encoding.
         encoder = tiktoken.get_encoding("cl100k_base")
     token_ids = encoder.encode(text)
     return len(token_ids)
 def trim_conversation(convo: list[dict], model: str, max_context: int) -> list[dict]:
     """Return a copy of *convo* cut down to fit within *max_context* tokens.

     The first message (the system prompt) is always kept. The remaining
     messages are considered newest-first; the walk stops at the first message
     that would push the running total plus ``BUFFER_TOKENS`` over
     *max_context*, so that message and everything older is dropped. The
     surviving messages are returned in their original chronological order.

     Raises ``IndexError`` if *convo* is empty.
     """
     system_msg = convo[0]
     used = count_tokens(system_msg["content"], model)
     newest_first = []
     for message in reversed(convo[1:]):
         size = count_tokens(message["content"], model)
         # BUFFER_TOKENS is headroom reserved for the model's reply.
         if used + size + BUFFER_TOKENS > max_context:
             break
         newest_first.append(message)
         used += size
     # Flip back to oldest-first and put the system prompt in front.
     return [system_msg, *reversed(newest_first)]
 def token_cost(model: str, p: int, c: int) -> float:
     """Return the USD cost of *p* prompt tokens and *c* completion tokens.

     Prices are read from ``MODEL_DETAILS[model]`` and treated as USD per
     1,000,000 tokens; the result is rounded to 6 decimal places. A model with
     no (or an empty) ``MODEL_DETAILS`` entry costs ``0.0``.
     """
     pricing = MODEL_DETAILS.get(model)
     if pricing:
         prompt_usd = p * pricing["input_price"]
         completion_usd = c * pricing["output_price"]
         return round((prompt_usd + completion_usd) / 1_000_000, 6)
     return 0.0
 # ────────────────────────────────
+# 3 · OpenAI helpers (streaming)
 # ────────────────────────────────
 def safe_chat_stream(convo: list[dict], max_ctx: int, max_rep: int, models: list[str]):
     """Stream reply; after completion return usage safely (avoids max_tokens=0 bug)."""
     last_exc = None
     for m in models:
         try:
+            # Ensure the selected model is valid and its max context is used
+            current_model_max_context = MODEL_DETAILS.get(m, {}).get("max_context", max_ctx)
+            trimmed_convo = trim_conversation(convo, m, current_model_max_context)
             stream = client.chat.completions.create(
                 model=m,
+                messages=trimmed_convo,
                 max_tokens=max_rep,
                 temperature=TEMPERATURE,
                 stream=True,
             for chunk in stream:
                 delta = chunk.choices[0].delta.content or ""
                 reply += delta
+                yield reply, None, m  # Yield reply, None for usage, and the model name while streaming
             # --- Retrieve usage tokens in a way that never requests max_tokens=0 ---
             try:
+                # To get accurate usage, ideally you'd send the full conversation + reply back
+                # This call is mainly to get token usage if the stream doesn't provide it directly
+                # (some newer SDK versions might have it on stream.usage)
                 usage_resp = client.chat.completions.create(
                     model=m,
+                    messages=trimmed_convo + [{"role": "assistant", "content": reply}],
+                    max_tokens=1, # 0 can trigger 400 on some models/tiers
                     temperature=0,
                 )
                 usage = usage_resp.usage
             except Exception:
                 # fallback: estimate usage roughly if call above fails
+                # This estimation is crude but better than nothing
+                prompt_tokens_est = count_tokens(" ".join([msg["content"] for msg in trimmed_convo]), m)
+                completion_tokens_est = count_tokens(reply, m)
+                usage = type('obj', (object,), {'prompt_tokens': prompt_tokens_est, 'completion_tokens': completion_tokens_est})()
+            yield reply, usage, m # Yield final reply, usage, and the model name
             return
         except Exception as e:
             msg = str(e).lower()
             if "context length" in msg:
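+                # Context-length error: re-trim the conversation to 80% of max_ctx, then move on.
+                # NOTE: `continue` advances the `for m in models` loop, so the NEXT model is tried
+                # with the trimmed conversation — the current model is not retried.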
+                convo = trim_conversation(convo, m, max_ctx * 0.8) # Try 80% of max context
+                last_exc = e
+                continue # Try the same model again with more aggressive trimming
             if "model_not_found" in msg or "does not exist" in msg or "404" in msg:
                 last_exc = e
+                continue # Try the next model in the list
             last_exc = e
+            break # For other errors, break and re-raise
+    raise last_exc or RuntimeError("All models failed or an unexpected error occurred.")
 # ────────────────────────────────
+# 4 · Gradio generators
 # ────────────────────────────────
 def chat_stream(user_msg: str, hist: list[tuple[str, str]], sys_prompt: str, sel_model: str, ctx: int, rep: int):
     user_msg = (user_msg or "").strip()
     if not user_msg:
+        yield hist, "", "Please enter a message.", "" # Clear user input and show message
         return
     if not client.api_key:
         hist = hist or []
+        hist.append((user_msg, "❌ OPENAI_API_KEY not set. Please set your API key in environment variables."))
+        yield hist, "", "API Key Not Set", ""
         return
     convo = [{"role": "system", "content": sys_prompt}]
     convo.append({"role": "user", "content": user_msg})
     hist = hist or []
+    hist.append((user_msg, "")) # Append user message, assistant's reply will be filled in
+    status_message = f"Using model: **{sel_model}**"
+    yield hist, "", status_message, "" # Update status immediately
+    models_to_try = [sel_model] + [m for m in ALL_MODELS if m != sel_model]
     try:
+        acc = ""
+        usage_final = None
+        used_model = sel_model # Store the actual model that succeeded
+        for part, usage, model_name in safe_chat_stream(convo, ctx, rep, models_to_try):
             acc = part
             hist[-1] = (user_msg, acc)
             if usage:
                 usage_final = usage
+                used_model = model_name # Update to the actual model that generated the response
+            yield hist, "", f"Using model: **{used_model}**", "" # Continuously update status
         if usage_final:
             pt, ct = usage_final.prompt_tokens, usage_final.completion_tokens
+            cost = token_cost(used_model, pt, ct)
+            meta = f"\n\n---\n🔢 {pt+ct} tokens (prompt {pt} / completion {ct}) · 💲{cost:.6f} USD"
             hist[-1] = (user_msg, acc + meta)
+            yield hist, "", f"Completed with model: **{used_model}** {meta}", ""
+        else:
+            yield hist, "", f"Completed with model: **{used_model}** (Usage details not available)", ""
     except Exception as e:
         hist[-1] = (user_msg, f"❌ OpenAI error: {e}")
+        yield hist, "", f"Error with model: **{sel_model}** - {e}", ""
 def clear_chat():
     """Return reset values for the four components wired to the Clear button.

     The tuple is ``(history, user_input, status, example_choice)``: a fresh
     empty history list followed by three empty strings.
     """
     fresh_history = []
     blank = ""
     return fresh_history, blank, blank, blank
 # ────────────────────────────────
+# 5 · UI
 # ────────────────────────────────
+# Build the Gradio UI: status line, advanced settings, example picker, system prompt,
+# chat area, and the event wiring that connects them to chat_stream / clear_chat.
+with gr.Blocks(title="🤖 CodeBot", theme=gr.themes.Soft()) as demo:
+    # Client-side shortcut: Ctrl+D toggles the 'dark' class on the document root element.
     gr.HTML("""
     <script>document.addEventListener('keydown',e=>{if(e.key==='d'&&e.ctrlKey){document.documentElement.classList.toggle('dark');}});</script>
     """)
+    gr.Markdown("## CodeBot – Ask me about Python, C#, SQL …")
+    # Status message display (third output of chat_stream / clear_chat)
+    status_display = gr.Markdown(value="Ready.", elem_id="status_display")
+    with gr.Accordion("Advanced Settings ▾", open=False):
         with gr.Row():
+            mdl = gr.Dropdown(
+                ALL_MODELS,
+                value=ALL_MODELS[0],
+                label="Model",
+                info="Select the OpenAI model to use for generation."
+            )
+            # The slider's maximum is fixed at the largest max_context of any known model;
+            # only its current value is changed when a different model is selected (see below).
+            ctx_s = gr.Slider(
+                minimum=1000,
+                maximum=max(mdl_data["max_context"] for mdl_data in MODEL_DETAILS.values()),
+                step=256,
+                value=DEFAULT_MAX_CONTEXT,
+                label="Max Context Tokens",
+                info="Maximum number of tokens for the entire conversation context (history + current message)."
+            )
+            rep_s = gr.Slider(
+                minimum=100,
+                maximum=4096, # Set a reasonable max reply limit, avoid setting it to full context
+                step=100,
+                value=DEFAULT_REPLY_MAX,
+                label="Max Reply Tokens",
+                info="Maximum number of tokens the model will generate in its response."
+            )
+    # Return the selected model's max_context (or DEFAULT_MAX_CONTEXT for an unknown model).
+    # Because the result is sent to ctx_s, it sets the slider's current VALUE, not its maximum.
+    def update_max_context_slider(selected_model):
+        return MODEL_DETAILS.get(selected_model, {}).get("max_context", DEFAULT_MAX_CONTEXT)
+    mdl.change(
+        fn=update_max_context_slider,
+        inputs=mdl,
+        outputs=ctx_s
+    )
     ex_list = [
         "How do I implement quicksort in Python?",
         "Show me a C# LINQ group-by example.",
         "Explain async/await in Python.",
+        "What are the key differences between SQL and NoSQL databases?",
+        "Write a simple 'Hello, World!' program in Rust."
     ]
     with gr.Row():
+        ex_drop = gr.Dropdown(ex_list, label="Examples", info="Quickly load a common coding query.")
+        ex_btn = gr.Button("Load Example")
+    sys_txt = gr.Textbox(
+        "You are CodeBot, an expert software engineer specializing in Python, C#, and SQL. Provide clear, concise, and accurate code examples and explanations. Always consider context and best practices.",
+        lines=3,
+        label="System Prompt",
+        info="This prompt guides the AI's behavior and personality. Adjust it for different roles."
+    )
+    chat = gr.Chatbot(value=[("", "👋 Hello! I'm CodeBot. How can I help you today?")], label="Conversation", height=500)
     with gr.Row():
+        usr_in = gr.Textbox(placeholder="Ask me anything…", show_label=False, container=False)
         send = gr.Button("Send", variant="primary")
+        clr = gr.Button("Clear Chat", variant="secondary")
+    # Copy the chosen example (or an empty string when nothing is selected) into the input box.
     ex_btn.click(lambda q: q or "", inputs=ex_drop, outputs=usr_in)
+    # NOTE(review): chat_stream yields 4 values per step, but only 3 outputs are listed on the
+    # two handlers below — confirm whether a fourth output (e.g. ex_drop, as for clr) was intended
+    # or whether chat_stream should yield 3 values.
+    send.click(chat_stream, inputs=[usr_in, chat, sys_txt, mdl, ctx_s, rep_s], outputs=[chat, usr_in, status_display])
+    usr_in.submit(chat_stream, inputs=[usr_in, chat, sys_txt, mdl, ctx_s, rep_s], outputs=[chat, usr_in, status_display]) # Allow pressing Enter
+    clr.click(clear_chat, outputs=[chat, usr_in, status_display, ex_drop]) # Clear examples dropdown too for full reset
 # Queue for concurrency safety (comment out if unused)
 demo.queue(max_size=32, default_concurrency_limit=int(os.getenv("CODEBOT_CONCURRENCY", "2")))
 if __name__ == "__main__":
+    demo.launch()