CPU-LLM-Inference

Running

App Files Community

R-Kentaren committed about 14 hours ago

Commit

0ff1c7b

verified ·

1 Parent(s): 7fa6b40

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -94

app.py CHANGED Viewed

@@ -5,20 +5,18 @@ import sys
 import threading
 from itertools import islice
 from datetime import datetime
-import re  # for parsing <think> blocks
 import gradio as gr
 import torch
-from transformers import pipeline, TextIteratorStreamer, StoppingCriteria
 from transformers import AutoTokenizer
 from ddgs import DDGS
-from torch.utils._pytree import tree_map
-from config import *
 # Global event to signal cancellation from the UI thread to the generation thread
 cancel_event = threading.Event()
-access_token=os.environ['HF_TOKEN']
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
@@ -32,8 +30,7 @@ def load_pipeline(model_name):
     if model_name in PIPELINES:
         return PIPELINES[model_name]
     repo = MODELS[model_name]["repo_id"]
-    tokenizer = AutoTokenizer.from_pretrained(repo,
-                token=access_token)
     for dtype in (torch.bfloat16, torch.float16, torch.float32):
         try:
             pipe = pipeline(
@@ -41,9 +38,9 @@ def load_pipeline(model_name):
                 model=repo,
                 tokenizer=tokenizer,
                 trust_remote_code=True,
-                dtype=dtype, # Use `dtype` instead of deprecated `torch_dtype`
                 device_map="auto",
-                use_cache=True,      # Enable past-key-value caching
                 token=access_token)
             PIPELINES[model_name] = pipe
             return pipe
@@ -61,7 +58,6 @@ def load_pipeline(model_name):
     PIPELINES[model_name] = pipe
     return pipe
 def retrieve_context(query, max_results=6, max_chars=50):
     """
     Retrieve search snippets from DuckDuckGo (runs in background).
@@ -91,20 +87,23 @@ def format_conversation(history, system_prompt, tokenizer):
         return prompt
 def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
-    # Get model size from the MODELS dict (more reliable than string parsing)
-    model_size = MODELS[model_name].get("params_b", 4.0)  # Default to 4B if not found
     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
-    # Adjusted for H200 performance: faster inference, quicker compilation
-    base_duration = 20 if not use_aot else 40  # Reduced base times
-    token_duration = max_tokens * 0.005  # ~200 tokens/second average on H200
-    search_duration = 10 if enable_search else 0  # Reduced search time
-    aot_compilation_buffer = 20 if use_aot else 0  # Faster compilation on H200
     return base_duration + token_duration + search_duration + aot_compilation_buffer
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
@@ -135,71 +134,53 @@ def chat_response(user_msg, chat_history, system_prompt,
     else:
         debug = 'Web search disabled.'
     try:
         cur_date = datetime.now().strftime('%Y-%m-%d')
-        # merge any fetched search results into the system prompt
-    if search_results:
-        enriched = system_prompt.strip() + f"""
-        # SEARCH CONTEXT (TRUSTED SOURCES ONLY)
-        Below are web search results. Treat them as the ONLY source of truth for answering.
-        {search_results}
-        RULES (VERY IMPORTANT):
-        - Do NOT use outside knowledge. Do NOT guess or fill missing information.
-        - If the answer is not clearly supported by the search results, say: "Not enough information in the provided sources."
-        - Every factual statement must be directly supported by at least one citation [citation:X].
-        - Do NOT add explanations, examples, or background that are not explicitly present in the sources.
-        - Do NOT paraphrase beyond what is necessary for clarity.
-        - If sources conflict, mention the conflict and cite both.
-        - If multiple sources are used, distribute citations per sentence, not only at the end.
-        CITATION RULES:
-        - Use inline citations like this: [citation:1]
-        - If multiple sources support a sentence: [citation:1][citation:3]
-        - Never place all citations only at the end.
-        ANSWER POLICY:
-        - Be concise and strictly grounded.
-        - No speculation, no assumptions, no "likely", no "probably".
-        - If the user requests a list, only include items explicitly found in sources.
-        - If sources are insufficient, stop and ask for more data instead of guessing.
-        DATE CONTEXT:
-        - Today is {cur_date} (use only for time reference, not for assumptions).
-        USER QUESTION:
-        """
-    else:
-        enriched = system_prompt
-        # wait up to 1s for snippets, then replace debug with them
-        if enable_search:
-            thread_search.join(timeout=float(search_timeout))
-            if search_results:
-                debug = "### Search results merged into prompt\n\n" + "\n".join(
-                    f"- {r}" for r in search_results
-                )
-            else:
-                debug = "*No web search results found.*"
-        # merge fetched snippets into the system prompt
         if search_results:
-            enriched = system_prompt.strip() + \
-            f'''\n# The following contents are the search results related to the user's message:
-            {search_results}
-            In the search results I provide to you, each result is formatted as [webpage X begin]...[webpage X end], where X represents the numerical index of each article. Please cite the context at the end of the relevant sentence when appropriate. Use the citation format [citation:X] in the corresponding part of your answer. If a sentence is derived from multiple contexts, list all relevant citation numbers, such as [citation:3][citation:5]. Be sure not to cluster all citations at the end; instead, include them in the corresponding parts of the answer.
-            When responding, please keep the following points in mind:
-            - Today is {cur_date}.
-            - Not all content in the search results is closely related to the user's question. You need to evaluate and filter the search results based on the question.
-            - For listing-type questions (e.g., listing all flight information), try to limit the answer to 10 key points and inform the user that they can refer to the search sources for complete information. Prioritize providing the most complete and relevant items in the list. Avoid mentioning content not provided in the search results unless necessary.
-            - For creative tasks (e.g., writing an essay), ensure that references are cited within the body of the text, such as [citation:3][citation:5], rather than only at the end of the text. You need to interpret and summarize the user's requirements, choose an appropriate format, fully utilize the search results, extract key information, and generate an answer that is insightful, creative, and professional. Extend the length of your response as much as possible, addressing each point in detail and from multiple perspectives, ensuring the content is rich and thorough.
-            - If the response is lengthy, structure it well and summarize it in paragraphs. If a point-by-point format is needed, try to limit it to 5 points and merge related content.
-            - For objective Q&A, if the answer is very brief, you may add one or two related sentences to enrich the content.
-            - Choose an appropriate and visually appealing format for your response based on the user's requirements and the content of the answer, ensuring strong readability.
-            - Your answer should synthesize information from multiple relevant webpages and avoid repeatedly citing the same webpage.
-            - Unless the user requests otherwise, your response should be in the same language as the user's question.
-            # The user's message is:
-            '''
         else:
-            enriched = system_prompt
         pipe = load_pipeline(model_name)
@@ -288,7 +269,6 @@ def chat_response(user_msg, chat_history, system_prompt,
     except GeneratorExit:
         # Handle cancellation gracefully
         print("Chat response cancelled.")
-        # Don't yield anything - let the cancellation propagate
         return
     except Exception as e:
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
@@ -296,7 +276,6 @@ def chat_response(user_msg, chat_history, system_prompt,
     finally:
         gc.collect()
 def update_default_prompt(enable_search):
     return f"You are a helpful assistant."
@@ -307,7 +286,7 @@ def update_duration_estimate(model_name, enable_search, max_results, max_chars,
         duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
                               enable_search, max_results, max_chars, model_name,
                               max_tokens, 0.7, 40, 0.9, 1.2, search_timeout)
-        model_size = MODELS[model_name].get("params_b", 4.0)
         return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
                 f"📊 **Model Size:** {model_size:.1f}B parameters\n"
                 f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}")
@@ -355,7 +334,7 @@ with gr.Blocks(
                     value=False,
                     info="Augment responses with real-time web data"
                 )
-                sys_prompt = gr.Textbox(label="📝 System Prompt", lines=3, value=update_default_prompt(search_chk.value), placeholder="Define the assistant's behavior and personality...")
             # Duration Estimate
             duration_display = gr.Markdown(
@@ -479,14 +458,10 @@ with gr.Blocks(
         It uses a try...finally block to ensure the UI is always reset.
         """
         if not user_msg.strip():
-            # If the message is empty, do nothing.
-            # We yield an empty dict to avoid any state changes.
             yield {}
             return
-        # 1. Update UI to "generating" state.
-        #    Crucially, we do NOT update the `chat` component here, as the backend
-        #    will provide the correctly formatted history in the first response chunk.
         yield {
             txt: gr.update(value="", interactive=False),
             submit_btn: gr.update(interactive=False),
@@ -495,7 +470,6 @@ with gr.Blocks(
         cancelled = False
         try:
-            # 2. Call the backend and stream updates
             backend_args = [user_msg, chat_history] + list(args)
             for response_chunk in chat_response(*backend_args):
                 yield {
@@ -503,20 +477,17 @@ with gr.Blocks(
                     dbg: response_chunk[1],
                 }
         except GeneratorExit:
-            # Mark as cancelled and re-raise to prevent "generator ignored GeneratorExit"
             cancelled = True
             print("Generation cancelled by user.")
             raise
         except Exception as e:
             print(f"An error occurred during generation: {e}")
-            # If an error happens, add it to the chat history to inform the user.
             error_history = (chat_history or []) + [
                 {'role': 'user', 'content': user_msg},
                 {'role': 'assistant', 'content': f"**An error occurred:** {str(e)}"}
             ]
             yield {chat: error_history}
         finally:
-            # Only reset UI if not cancelled (to avoid "generator ignored GeneratorExit")
             if not cancelled:
                 print("Resetting UI state.")
                 yield {
@@ -532,7 +503,7 @@ with gr.Blocks(
     def reset_ui_after_cancel():
         """Reset UI components after cancellation."""
-        cancel_event.clear()  # Clear the flag for next generation
         print("UI reset after cancellation.")
         return {
             txt: gr.update(interactive=True),
@@ -553,7 +524,6 @@ with gr.Blocks(
     )
     # Event for the "Cancel" button.
-    # It sets the cancel flag, cancels the submit event, then resets the UI.
     cancel_btn.click(
         fn=set_cancel_flag,
         cancels=[submit_event]

 import threading
 from itertools import islice
 from datetime import datetime
+import re
 import gradio as gr
 import torch
+from transformers import pipeline, TextIteratorStreamer
 from transformers import AutoTokenizer
 from ddgs import DDGS
+from config import MODELS  # Import from config file
 # Global event to signal cancellation from the UI thread to the generation thread
 cancel_event = threading.Event()
+access_token = os.environ.get('HF_TOKEN', '')
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
     if model_name in PIPELINES:
         return PIPELINES[model_name]
     repo = MODELS[model_name]["repo_id"]
+    tokenizer = AutoTokenizer.from_pretrained(repo, token=access_token)
     for dtype in (torch.bfloat16, torch.float16, torch.float32):
         try:
             pipe = pipeline(
                 model=repo,
                 tokenizer=tokenizer,
                 trust_remote_code=True,
+                dtype=dtype,
                 device_map="auto",
+                use_cache=True,
                 token=access_token)
             PIPELINES[model_name] = pipe
             return pipe
     PIPELINES[model_name] = pipe
     return pipe
 def retrieve_context(query, max_results=6, max_chars=50):
     """
     Retrieve search snippets from DuckDuckGo (runs in background).
         return prompt
 def get_duration(user_msg, chat_history, system_prompt, enable_search, max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty, search_timeout):
+    # Get model size from the MODELS dict
+    model_size = MODELS[model_name].get("params_b", 4.0)
     # Only use AOT for models >= 2B parameters
     use_aot = model_size >= 2
+    # Adjusted for H200 performance
+    base_duration = 20 if not use_aot else 40
+    token_duration = max_tokens * 0.005
+    search_duration = 10 if enable_search else 0
+    aot_compilation_buffer = 20 if use_aot else 0
     return base_duration + token_duration + search_duration + aot_compilation_buffer
+def get_model_size(model_name):
+    """Get model size from the MODELS dict."""
+    return MODELS.get(model_name, {}).get("params_b", 4.0)
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
     else:
         debug = 'Web search disabled.'
+    # Wait for search results if enabled
+    if enable_search:
+        thread_search.join(timeout=float(search_timeout))
+        if search_results:
+            debug = "### Search results merged into prompt\n\n" + "\n".join(
+                f"- {r}" for r in search_results
+            )
+        else:
+            debug = "*No web search results found.*"
     try:
         cur_date = datetime.now().strftime('%Y-%m-%d')
+        # Prepare enriched system prompt
         if search_results:
+            enriched = system_prompt.strip() + f"""
+# SEARCH CONTEXT (TRUSTED SOURCES ONLY)
+Below are web search results. Treat them as the ONLY source of truth for answering.
+{search_results}
+RULES (VERY IMPORTANT):
+- Do NOT use outside knowledge. Do NOT guess or fill missing information.
+- If the answer is not clearly supported by the search results, say: "Not enough information in the provided sources."
+- Every factual statement must be directly supported by at least one citation [citation:X].
+- Do NOT add explanations, examples, or background that are not explicitly present in the sources.
+- Do NOT paraphrase beyond what is necessary for clarity.
+- If sources conflict, mention the conflict and cite both.
+- If multiple sources are used, distribute citations per sentence, not only at the end.
+CITATION RULES:
+- Use inline citations like this: [citation:1]
+- If multiple sources support a sentence: [citation:1][citation:3]
+- Never place all citations only at the end.
+ANSWER POLICY:
+- Be concise and strictly grounded.
+- No speculation, no assumptions, no "likely", no "probably".
+- If the user requests a list, only include items explicitly found in sources.
+- If sources are insufficient, stop and ask for more data instead of guessing.
+DATE CONTEXT:
+- Today is {cur_date} (use only for time reference, not for assumptions).
+USER QUESTION:
+"""
         else:
+            enriched = system_prompt.strip()
         pipe = load_pipeline(model_name)
     except GeneratorExit:
         # Handle cancellation gracefully
         print("Chat response cancelled.")
         return
     except Exception as e:
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
     finally:
         gc.collect()
 def update_default_prompt(enable_search):
     return f"You are a helpful assistant."
         duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
                               enable_search, max_results, max_chars, model_name,
                               max_tokens, 0.7, 40, 0.9, 1.2, search_timeout)
+        model_size = get_model_size(model_name)
         return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
                 f"📊 **Model Size:** {model_size:.1f}B parameters\n"
                 f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}")
                     value=False,
                     info="Augment responses with real-time web data"
                 )
+                sys_prompt = gr.Textbox(label="📝 System Prompt", lines=3, value=update_default_prompt(False), placeholder="Define the assistant's behavior and personality...")
             # Duration Estimate
             duration_display = gr.Markdown(
         It uses a try...finally block to ensure the UI is always reset.
         """
         if not user_msg.strip():
             yield {}
             return
+        # Update UI to "generating" state
         yield {
             txt: gr.update(value="", interactive=False),
             submit_btn: gr.update(interactive=False),
         cancelled = False
         try:
             backend_args = [user_msg, chat_history] + list(args)
             for response_chunk in chat_response(*backend_args):
                 yield {
                     dbg: response_chunk[1],
                 }
         except GeneratorExit:
             cancelled = True
             print("Generation cancelled by user.")
             raise
         except Exception as e:
             print(f"An error occurred during generation: {e}")
             error_history = (chat_history or []) + [
                 {'role': 'user', 'content': user_msg},
                 {'role': 'assistant', 'content': f"**An error occurred:** {str(e)}"}
             ]
             yield {chat: error_history}
         finally:
             if not cancelled:
                 print("Resetting UI state.")
                 yield {
     def reset_ui_after_cancel():
         """Reset UI components after cancellation."""
+        cancel_event.clear()
         print("UI reset after cancellation.")
         return {
             txt: gr.update(interactive=True),
     )
     # Event for the "Cancel" button.
     cancel_btn.click(
         fn=set_cancel_flag,
         cancels=[submit_event]