# app.py (LoRA-only loading)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import re
import time
from datetime import datetime

# ===== Settings =====
device = 0 if torch.cuda.is_available() else -1
lora_repo = "rahul7star/GPT-Diffuser-v1"  # ONLY LoRA fine-tuned repo

log_lines = []


def log(msg):
    line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(line)
    log_lines.append(line)


log(f"🚀 Loading LoRA-only model from {lora_repo}")
log(f"Device: {'GPU' if device == 0 else 'CPU'}")

# ====== Tokenizer ======
try:
    tokenizer = AutoTokenizer.from_pretrained(lora_repo, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal LMs often ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token
    log(f"✅ Tokenizer loaded: vocab size {tokenizer.vocab_size}")
except Exception as e:
    log(f"❌ Tokenizer load failed: {e}")
    tokenizer = None

# ====== LoRA-only model ======
# Loading an adapter-only repo directly with AutoModelForCausalLM works when
# `peft` is installed: transformers detects adapter_config.json, fetches the
# base model it references, and attaches the LoRA weights.
model = None
pipe = None
try:
    model = AutoModelForCausalLM.from_pretrained(
        lora_repo,
        trust_remote_code=True,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()
    log("✅ LoRA-only model loaded successfully")

    # Do not pass `device=` here: a model dispatched with device_map="auto"
    # is already placed by accelerate, and the pipeline errors out if asked
    # to move it. On CPU the default placement is already correct.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    log("✅ Pipeline ready for inference")
except Exception as e:
    log(f"❌ LoRA model load failed: {e}")


# ====== Chat Function ======
def chat_with_model(message, history):
    # Reset the log so the panel shows only this message's trace.
    log_lines.clear()
    log(f"💭 User message: {message}")

    if pipe is None:
        return "", history, "⚠️ Model pipeline not loaded."

    context = (
        "The following is a conversation between a user and an AI assistant "
        "trained on the Bhagavad Gita.\n"
    )
    for user, bot in history:
        context += f"User: {user}\nAssistant: {bot}\n"
    context += f"User: {message}\nAssistant:"

    log("📄 Built conversation context")
    log(context)

    start_time = time.time()
    try:
        output = pipe(
            context,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )[0]["generated_text"]
        log(f"⏱️ Inference took {time.time() - start_time:.2f}s")
    except Exception as e:
        log(f"❌ Generation failed: {e}")
        return "", history, "\n".join(log_lines)

    # Clean reply: drop the echoed prompt, strip stray markup, collapse
    # whitespace, and cut at any hallucinated next turn.
    reply = output[len(context):].strip()
    reply = re.sub(r"(ContentLoaded|<\/?[^>]+>|[\r\n]{2,})", " ", reply)
    reply = re.sub(r"\s{2,}", " ", reply).strip()
    reply = reply.split("User:")[0].split("Assistant:")[0].strip()

    log(f"🪄 Model reply: {reply}")
    history.append((message, reply))
    return "", history, "\n".join(log_lines)


# ===== Gradio =====
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("## 💬 Qwen LoRA-only — Bhagavad Gita Assistant")
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=500)
            msg = gr.Textbox(placeholder="Ask about the Gita...", label="Your Message")
            clear = gr.Button("Clear")
        with gr.Column(scale=1):
            log_box = gr.Textbox(label="Detailed Model Log", lines=25, interactive=False)

    msg.submit(chat_with_model, [msg, chatbot], [msg, chatbot, log_box])
    # The clear handler must return exactly one value per output component:
    # (chatbot, log_box).
    clear.click(lambda: (None, ""), None, [chatbot, log_box], queue=False)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
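
# --- Optional: explicit adapter loading (a minimal sketch, not part of the app) ---
# The one-step from_pretrained() above relies on transformers' PEFT
# integration. The equivalent explicit path, which can help debug a failed
# load, resolves the base model from the adapter config and attaches the
# LoRA weights by hand. This is illustration only, left commented out so it
# never runs with the app; the actual base model behind
# rahul7star/GPT-Diffuser-v1 is read from the adapter config, not assumed.
#
#   from peft import PeftConfig, PeftModel
#
#   peft_cfg = PeftConfig.from_pretrained(lora_repo)
#   base = AutoModelForCausalLM.from_pretrained(
#       peft_cfg.base_model_name_or_path, trust_remote_code=True
#   )
#   lora_model = PeftModel.from_pretrained(base, lora_repo)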