AI_Chatbot / app.py
LejobuildYT's picture
Upload 18 files
70b7b2b verified
#!/usr/bin/env python3
"""
Hugging Face Spaces Backend - Qwen 1.5B Instruct
Lightweight, fast and memory-efficient
"""
import os
import json
import logging
import gradio as gr
from pathlib import Path
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
pipeline
)
import time
# Logging: emit INFO-and-above records; this module logs under its own name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration: target device plus the default generation settings.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
MAX_TOKENS = 512  # passed to the pipeline as max_new_tokens
TEMPERATURE = 0.7  # default sampling temperature (also the slider default)
TOP_P = 0.9  # default top-p value (also the slider default)
# Default model choice (fixed; no memory-based auto-selection is performed)
def select_model():
    """Return the Hugging Face model id used when MODEL_NAME is not set.

    The choice is hard-coded to Qwen2.5-1.5B-Instruct, a small and fast
    model. An unreachable second ``return`` for a Zephyr AWQ checkpoint was
    removed; it could never execute.

    Returns:
        str: The model repository id.
    """
    return "Qwen/Qwen2.5-1.5B-Instruct"
# The MODEL_NAME environment variable, when set, overrides the built-in default.
MODEL_NAME = os.environ.get("MODEL_NAME", select_model())
logger.info(f"📌 Using model: {MODEL_NAME}")

# Plugin system: directory that is scanned, and the registry that maps a
# plugin name to the path of its source file.
PLUGIN_DIR = Path("plugins")
loaded_plugins = {}
def load_plugins():
    """Register the plugin files found in ``PLUGIN_DIR``.

    Every ``*.py`` file whose name does not start with ``__`` is recorded in
    ``loaded_plugins`` under its stem. Only the file path is stored; the
    file is not imported or executed here. A missing plugin directory is
    logged as a warning and nothing is registered.
    """
    if not PLUGIN_DIR.exists():
        logger.warning(f"Plugin directory {PLUGIN_DIR} does not exist")
        return

    # Skip dunder files such as __init__.py.
    candidates = (
        path for path in PLUGIN_DIR.glob("*.py")
        if not path.name.startswith("__")
    )
    for plugin_file in candidates:
        try:
            plugin_name = plugin_file.stem
            logger.info(f"Loading plugin: {plugin_name}")
            loaded_plugins[plugin_name] = plugin_file
            logger.info(f"✓ Plugin {plugin_name} loaded")
        except Exception as e:
            # One bad entry must not stop the remaining files from registering.
            logger.error(f"✗ Failed to load plugin {plugin_file}: {e}")
def call_plugin_hook(hook_name, *args, **kwargs):
    """Placeholder for dispatching ``hook_name`` to the registered plugins.

    The function walks the plugin registry but performs no call: the loop
    body is empty, so the arguments are currently unused and the function
    always returns ``None``.

    Args:
        hook_name: Name of the hook to invoke (unused for now).
        *args: Positional arguments intended for the hook (unused for now).
        **kwargs: Keyword arguments intended for the hook (unused for now).
    """
    for plugin_name in loaded_plugins:
        try:
            # NOTE(review): no dispatch is implemented yet.
            pass
        except Exception as e:
            logger.error(f"Error calling hook in {plugin_name}: {e}")
# Model initialisation (no quantization is applied)
logger.info(f"⏳ Loading model {MODEL_NAME} on {DEVICE}...")


def load_model_optimized():
    """Load the tokenizer and causal-LM weights for ``MODEL_NAME``.

    On CUDA the model is loaded with ``device_map="auto"`` in float16; on
    CPU it is loaded with no device map in float32. No quantization is used.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    on_gpu = DEVICE == "cuda"
    load_options = {
        "device_map": "auto" if on_gpu else None,
        "torch_dtype": torch.float16 if on_gpu else torch.float32,
    }
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_options)
    logger.info(f"✅ {MODEL_NAME} loaded successfully")
    return tokenizer, model
# Build the module-level tokenizer, model and text-generation pipeline once at
# import time. Any failure is logged and re-raised so the app does not start
# without a model.
try:
    tokenizer, model = load_model_optimized()

    pipeline_options = {}
    if DEVICE != "cuda":
        # CPU path: the model was loaded without a device map, so pin the
        # pipeline to the CPU explicitly.
        pipeline_options["device"] = -1
    # On CUDA the model is loaded with device_map="auto" and is therefore
    # already placed; transformers does not accept an explicit `device` for
    # such a model, so none is passed.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **pipeline_options,
    )
    logger.info("✅ Model loaded successfully")
except Exception as e:
    logger.error(f"✗ Failed to load model: {e}")
    raise
def generate_response(prompt: str, system_prompt: str = None) -> dict:
    """Generate a chat reply for ``prompt`` with the loaded pipeline.

    Generation settings are read from the module-level ``MAX_TOKENS``,
    ``TEMPERATURE`` and ``TOP_P`` at call time.

    Args:
        prompt: User message.
        system_prompt: Optional system message placed before the user
            message; skipped when empty or ``None``.

    Returns:
        dict: Always contains ``response``, ``tokens``, ``time_seconds`` and
        ``model``. When generation fails, ``response`` holds the error text,
        the counters are ``0`` and an extra ``error: True`` entry is added.
    """
    try:
        start_time = time.perf_counter()

        # Chat-style message list; the pipeline applies the chat template.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        # Sampling needs a strictly positive float temperature. UI sliders can
        # deliver ints or 0, so coerce the value and fall back to greedy
        # decoding when the temperature is not positive.
        temperature = float(TEMPERATURE)
        use_sampling = temperature > 0
        generation_kwargs = {
            "max_new_tokens": MAX_TOKENS,
            "do_sample": use_sampling,
            "return_full_text": False,
        }
        if use_sampling:
            generation_kwargs["temperature"] = temperature
            generation_kwargs["top_p"] = float(TOP_P)

        outputs = pipe(messages, **generation_kwargs)
        response_text = outputs[0]["generated_text"].strip()
        elapsed = time.perf_counter() - start_time

        result = {
            "response": response_text,
            # Count only the generated text, without tokenizer-added specials.
            "tokens": len(
                tokenizer.encode(response_text, add_special_tokens=False)
            ),
            "time_seconds": round(elapsed, 2),
            "model": MODEL_NAME,
        }
        logger.info(
            f"Generated {result['tokens']} tokens in {result['time_seconds']}s"
        )
        return result
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        # Keep the same keys as the success path so callers can read
        # result["model"] without a KeyError.
        return {
            "response": f"Error: {str(e)}",
            "tokens": 0,
            "time_seconds": 0,
            "model": MODEL_NAME,
            "error": True,
        }
# Register plugin files before the UI is built.
load_plugins()

# Gradio interface: prompt inputs, sampling sliders, response and statistics.
# The title and heading use MODEL_NAME so the UI names the model that is
# actually loaded.
with gr.Blocks(title=f"{MODEL_NAME} AI Chatbot") as demo:
    gr.Markdown(f"# 🤖 {MODEL_NAME} Inference Server")
    gr.Markdown("Powered by Hugging Face & Gradio")

    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Optional: Definiere die Rolle des Assistenten...",
                lines=3,
            )
        with gr.Column(scale=1):
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=2.0,
                value=TEMPERATURE,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                value=TOP_P,
                step=0.05,
            )

    user_input = gr.Textbox(
        label="Your Message",
        placeholder="Type your message...",
        lines=4,
    )
    submit_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    output_response = gr.Textbox(
        label="Response",
        interactive=False,
        lines=6,
    )
    # JSON is an output-only component, so no `interactive` flag is passed.
    output_stats = gr.JSON(label="Statistics")

    def process_input(prompt, system, temp, top):
        """Run one generation with the slider values and build the outputs.

        ``generate_response`` reads the module-level ``TEMPERATURE`` and
        ``TOP_P``, so they are overridden for the duration of the call and
        restored afterwards, even if the call raises.

        Args:
            prompt: User message text.
            system: System prompt text (may be empty).
            temp: Temperature slider value.
            top: Top-p slider value.

        Returns:
            tuple: ``(response_text, stats_dict)`` for the two output
            components.
        """
        global TEMPERATURE, TOP_P
        saved_temperature, saved_top_p = TEMPERATURE, TOP_P
        # Sliders may deliver whole numbers as ints; normalise to float.
        TEMPERATURE, TOP_P = float(temp), float(top)
        try:
            result = generate_response(prompt, system)
        finally:
            TEMPERATURE, TOP_P = saved_temperature, saved_top_p

        # An error result may lack some keys, so read them with fallbacks
        # instead of raising KeyError.
        stats = {
            "Tokens": result.get("tokens", 0),
            "Time (s)": result.get("time_seconds", 0),
            "Model": result.get("model", MODEL_NAME),
        }
        return result["response"], stats

    submit_btn.click(
        process_input,
        inputs=[user_input, system_prompt, temperature, top_p],
        outputs=[output_response, output_stats],
    )

    # Examples
    gr.Examples(
        examples=[
            ["Was ist Machine Learning?", "Du bist ein hilfsbereiter AI-Assistent.", 0.7, 0.9],
            ["Schreibe einen kurzen Witz", "", 0.9, 0.95],
            ["Erkläre Quantencomputing in einfachen Worten", "Du bist ein Physik-Professor.", 0.7, 0.9],
        ],
        inputs=[user_input, system_prompt, temperature, top_p],
    )
if __name__ == "__main__":
    # Enable request queuing, then serve on all interfaces at port 7860
    # without creating a public share link.
    queued_app = demo.queue()
    queued_app.launch(server_name="0.0.0.0", server_port=7860, share=False)