Spaces:

wolfofbackstreet
/

tiny-gguf-on-cpu

Sleeping

App Files Files Community

wolfofbackstreet committed on Apr 29, 2025

Commit

b1e40ab

1 Parent(s): f17f776

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -25

app.py CHANGED Viewed

@@ -3,12 +3,16 @@ from typing import get_type_hints, Callable, Any
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 model_id = "unsloth/SmolLM2-135M-Instruct-GGUF"
 filename = "SmolLM2-135M-Instruct-Q8_0.gguf"
 tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
 model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
 def parse_docstring(func):
     doc = inspect.getdoc(func)
@@ -22,6 +26,7 @@ def parse_docstring(func):
     return {"title": title, "description": description}
 def gradio_app_with_docs(func: Callable) -> Callable:
     sig = inspect.signature(func)
     type_hints = get_type_hints(func)
@@ -30,14 +35,12 @@ def gradio_app_with_docs(func: Callable) -> Callable:
     """
     A decorator that automatically builds and launches a Gradio interface
     based on function type hints.
     Args:
         func: A callable with type-hinted parameters and return type.
     Returns:
         The wrapped function with a `.launch()` method to start the app.
     """
-    # Infer Gradio components from type hints
     def _map_type(t: type) -> gr.Component:
         if t == str:
             return gr.Textbox(label="Input")
@@ -47,7 +50,7 @@ def gradio_app_with_docs(func: Callable) -> Callable:
             return gr.Number()
         elif t == bool:
             return gr.Checkbox()
-        elif hasattr(t, "__origin__") and t.__origin__ == list:  # Handle List[type]
             elem_type = t.__args__[0]
             if elem_type == str:
                 return gr.Dropdown(choices=["Option1", "Option2"])
@@ -56,30 +59,24 @@ def gradio_app_with_docs(func: Callable) -> Callable:
         else:
             raise ValueError(f"Unsupported type: {t}")
-    # Extract function signature and type hints
-    sig = inspect.signature(func)
-    type_hints = get_type_hints(func)
-    # Map parameters to Gradio inputs
     inputs = []
     for name, param in sig.parameters.items():
         if name == "self":
-            continue  # Skip self in class methods
         param_type = type_hints.get(name, Any)
         component = _map_type(param_type)
         component.label = name.replace("_", " ").title()
         inputs.append(component)
-    # Map return type to Gradio output
     return_type = type_hints.get("return", Any)
     outputs = _map_type(return_type)
     # Wrap function with Gradio interface
-    interface = gr.Interface(fn=func, inputs=inputs, outputs=outputs)
     with gr.Blocks() as demo:
         gr.Markdown(f"## {metadata['title']}\n{metadata['description']}")
-        interface = gr.Interface(fn=func, inputs=inputs, outputs=outputs)
     def wrapper(*args, **kwargs):
         return func(*args, **kwargs)
@@ -93,27 +90,38 @@ def generate_response(prompt: str) -> str:
     """
     Title: Super Tiny GGUF Model on CPU
     Description: A Simple app to test out the potentials of small GGUF LLM model.
     Args:
         prompt (str): A simple prompt.
     Returns:
         str: Simplified response.
     """
-    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")  # Move inputs to CPU
     outputs = model.generate(
         **inputs,
-        max_new_tokens=50,
-        temperature=0.7,
-        top_p=0.9
     )
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
-# # Example usage
-# prompt = "Explain quantum computing in simple terms."
-# response = generate_response(prompt)
-# print(response)
 if __name__ == "__main__":
     generate_response.launch()

 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# --- Load Model and Tokenizer ---
 model_id = "unsloth/SmolLM2-135M-Instruct-GGUF"
 filename = "SmolLM2-135M-Instruct-Q8_0.gguf"
 tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
 model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
+# --- System Prompt Template ---
+SYSTEM_PROMPT = """You are a helpful AI assistant. Your job is to provide clear and concise responses based on the user's input.
+Keep your answers straightforward and avoid unnecessary information."""
 def parse_docstring(func):
     doc = inspect.getdoc(func)
     return {"title": title, "description": description}
 def gradio_app_with_docs(func: Callable) -> Callable:
     sig = inspect.signature(func)
     type_hints = get_type_hints(func)
     """
     A decorator that automatically builds and launches a Gradio interface
     based on function type hints.
     Args:
         func: A callable with type-hinted parameters and return type.
     Returns:
         The wrapped function with a `.launch()` method to start the app.
     """
     def _map_type(t: type) -> gr.Component:
         if t == str:
             return gr.Textbox(label="Input")
             return gr.Number()
         elif t == bool:
             return gr.Checkbox()
+        elif hasattr(t, "__origin__") and t.__origin__ == list:
             elem_type = t.__args__[0]
             if elem_type == str:
                 return gr.Dropdown(choices=["Option1", "Option2"])
         else:
             raise ValueError(f"Unsupported type: {t}")
+    # Build inputs
     inputs = []
     for name, param in sig.parameters.items():
         if name == "self":
+            continue
         param_type = type_hints.get(name, Any)
         component = _map_type(param_type)
         component.label = name.replace("_", " ").title()
         inputs.append(component)
+    # Build outputs
     return_type = type_hints.get("return", Any)
     outputs = _map_type(return_type)
     # Wrap function with Gradio interface
     with gr.Blocks() as demo:
         gr.Markdown(f"## {metadata['title']}\n{metadata['description']}")
+        gr.Interface(fn=func, inputs=inputs, outputs=outputs)
     def wrapper(*args, **kwargs):
         return func(*args, **kwargs)
     """
     Title: Super Tiny GGUF Model on CPU
     Description: A Simple app to test out the potentials of small GGUF LLM model.
     Args:
         prompt (str): A simple prompt.
     Returns:
         str: Simplified response.
     """
+    # Apply system prompt + user input
+    # full_prompt = f"<|begin_of_text|>System: {SYSTEM_PROMPT}\nUser: {prompt}\nAssistant:"
+    # inputs = tokenizer(full_prompt, return_tensors="pt").to("cpu")
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": prompt}
+    ]
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
+    )
+    inputs = tokenizer([text], return_tensors="pt").to(model.device)
     outputs = model.generate(
         **inputs,
+        max_new_tokens=100,
+        # temperature=0.7,
+        # top_p=0.9
     )
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 if __name__ == "__main__":
     generate_response.launch()