tarnava committed
Commit 58ae25b · verified · 1 Parent(s): b72b12f

Update app.py

Files changed (1): app.py (+13 −11)
app.py CHANGED
@@ -1,9 +1,12 @@
+import os
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # faster download
+
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 import gradio as gr
 
-# --- Model load (optimized for CPU) ---
+# --- Model load (CPU only) ---
 BASE_MODEL = "Qwen/Qwen2.5-1.5B"
 LORA_ADAPTER = "modular-ai/qwen"
 
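Note: `HF_HUB_ENABLE_HF_TRANSFER=1` only speeds downloads up if the Rust-based `hf_transfer` package is actually installed in the Space; with the flag set but the package missing, `huggingface_hub` raises an error instead of falling back to the default downloader. A defensive variant (a sketch, not part of this commit):

```python
import importlib.util
import os

# Enable the Rust download backend only when hf_transfer is importable;
# with the flag set but the package missing, huggingface_hub errors out
# rather than silently using the default downloader.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
```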
@@ -11,10 +14,10 @@ print("Loading base model on CPU... (the first load takes 1-2 min)")
 
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
-    torch_dtype=torch.float32,  # float16 is not usable on CPU
-    device_map="cpu",           # CPU only
+    torch_dtype=torch.float32,
+    device_map="cpu",
     trust_remote_code=True,
-    low_cpu_mem_usage=True      # saves memory
+    low_cpu_mem_usage=True
 )
 
 print("Loading LoRA adapter...")
@@ -24,7 +27,7 @@ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 
-# --- Chat function (fast & safe) ---
+# --- Chat function ---
 def ask_kant(message, history):
     prompt = f"### Instruction: You are Immanuel Kant.\n\n### Input: {message}\n\n### Response:"
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
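Old lines 31-43 (the generation call) are unchanged and therefore elided from this diff. For orientation only, a typical CPU-safe body for this Alpaca-style prompt might look like the hypothetical sketch below; everything beyond `inputs`, `tokenizer`, and `response` is an assumption, not the file's actual contents:

```python
# Hypothetical sketch of the elided generation step; `model` is assumed
# to be the PeftModel built from base_model + LORA_ADAPTER above.
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,                  # assumption: bounds CPU latency
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,
    )
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
```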
@@ -44,19 +47,17 @@ def ask_kant(message, history):
     bot_reply = response.split("### Response:")[-1].strip()
     return bot_reply
 
-# --- Gradio UI (simple & fast) ---
+# --- Gradio UI ---
 with gr.Blocks() as demo:
-    gr.Markdown("# 🧠 **Kant AI** – Qwen2.5-1.5B LoRA")
+    gr.Markdown("# Kant AI – Qwen2.5-1.5B LoRA")
     gr.Markdown("**Zero GPU | Free | Live Demo** \nAsk any question and *Immanuel Kant* will answer!")
 
     chatbot = gr.ChatInterface(
         fn=ask_kant,
-        title="",
         examples=[
             "What is freedom?",
             "Kya hai swatantrata?",
-            "Explain categorical imperative",
-            "Moral law kya hai?"
+            "Explain categorical imperative"
         ],
         cache_examples=False,
         submit_btn="Ask Kant",
@@ -64,4 +65,5 @@ with gr.Blocks() as demo:
 
     gr.Markdown("---\n*Model: Qwen2.5-1.5B + LoRA | CPU Only | ~8-12 sec per reply*")
 
-demo.launch()
+# --- This line fixes the startup error ---
+demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
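The launch fix matters because Hugging Face Spaces runs the app inside a container and expects the HTTP server on 0.0.0.0:7860; a Gradio app bound to localhost is unreachable from outside the container, which is the runtime error the new comment refers to. Gradio can also pick these values up from environment variables, so an equivalent configuration (a sketch, assuming the variables are set before `launch()` runs) is:

```python
import os

# Gradio reads these at launch time; equivalent to passing
# server_name/server_port explicitly as in the commit.
os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
os.environ["GRADIO_SERVER_PORT"] = "7860"

demo.launch(share=False)
```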