Spaces:
Paused
Paused
File size: 5,592 Bytes
c8c122b 24c9455 c8c122b 923da15 c8c122b 923da15 c8c122b 24c9455 c8c122b 24c9455 2b7ffcf c8c122b 24c9455 2b7ffcf c8c122b 2b7ffcf 24c9455 2b7ffcf 24c9455 c8c122b 3bf755c 24c9455 3bf755c 2b7ffcf 3bf755c 24c9455 2b7ffcf 3bf755c 2b7ffcf c8c122b 2b7ffcf 923da15 24c9455 2b7ffcf 24c9455 923da15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | import gradio as gr, requests, json, time, threading, torch, os, random, math
from transformers import AutoModelForCausalLM, AutoTokenizer
import urllib3; urllib3.disable_warnings()
def parse_worker_number(space_id: str) -> str:
    """Return the worker number embedded in a Space identifier.

    The identifier is split on "-" and the last segment made up solely of
    ASCII digits is returned. When no such segment exists, "1" is returned.

    Only ASCII digits are accepted: ``str.isdigit()`` alone is also true for
    characters such as "²" that ``int()`` cannot parse, which would make the
    later ``int(NUM)`` conversion raise.
    """
    number = ""
    for part in space_id.split("-"):
        if part.isascii() and part.isdigit():
            number = part  # keep overwriting: the last numeric segment wins
    return number or "1"


# Space identifier taken from the environment; empty when neither variable is set.
sid = os.environ.get("SPACE_ID") or os.environ.get("SPACE_NAME") or ""
NUM = parse_worker_number(sid)
TEACHER_ID = "hf-worker-" + NUM
N = int(NUM)
# === 8 models, listed from smallest to largest ===
# Each entry is (Hugging Face model id, size in billions of parameters).
ALL_MODELS = [
    ("HuggingFaceTB/SmolLM2-360M-Instruct", 0.36),
    ("Qwen/Qwen2.5-0.5B-Instruct", 0.5),
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 1.1),
    ("deepseek-ai/deepseek-coder-1.3b-instruct", 1.3),
    ("Qwen/Qwen2.5-1.5B-Instruct", 1.5),
    ("Qwen/Qwen2.5-Coder-1.5B-Instruct", 1.5),
    ("stabilityai/stablelm-2-1_6b", 1.6),
    ("HuggingFaceTB/SmolLM2-1.7B-Instruct", 1.7),
]

# Round-robin assignment: the worker number picks the model by index modulo
# the number of available models.
_model_slot = N % len(ALL_MODELS)
MID, B = ALL_MODELS[_model_slot]

# Pause between polling cycles, in seconds. It grows with the model size,
# shrinks with the last decimal digit of the worker number, and is clamped
# so it is never shorter than 3 seconds.
_raw_delay = 5 + B * 10 - N % 10
DELAY = max(3, int(_raw_delay))

print(f"ID:{TEACHER_ID} MODEL:{MID} ({B}B) DELAY:{DELAY}s")
# Backend address: requests go to the raw IP while the Host header carries
# the public hostname.
BIP = "3.125.223.134"
BH = "alfredo-agravic-saddeningly.ngrok-free.dev"

# Model and tokenizer placeholders; l() fills them in.
m = None
t = None

# One shared HTTP session for every backend call.
# NOTE(review): TLS certificate verification is disabled on this session.
s = requests.Session()
s.verify = False


def g(p):
    """Send a GET request for path `p` to the backend (60 s timeout)."""
    url = "https://" + BIP + p
    return s.get(url, headers={"Host": BH}, timeout=60)


def p(p, d):
    """POST `d` as JSON to path `p` on the backend (30 s timeout)."""
    url = "https://" + BIP + p
    return s.post(url, headers={"Host": BH}, json=d, timeout=30)
def l():
    """Load the model and tokenizer for MID into the globals `m` and `t`.

    The model is loaded in float32 on the CPU. If the tokenizer defines no
    padding token, its end-of-sequence token is used instead.
    """
    global m, t
    m = AutoModelForCausalLM.from_pretrained(
        MID,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
    )
    t = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
    if t.pad_token is None:
        t.pad_token = t.eos_token
def gen(ms, max_tokens, temp=0.7):
    """Generate a reply to the chat messages `ms` with the loaded model.

    Args:
        ms: chat messages, a list of {"role": ..., "content": ...} dicts.
        max_tokens: maximum number of new tokens to sample.
        temp: sampling temperature.

    Returns:
        The decoded completion only (prompt tokens are sliced off), with
        special tokens removed and surrounding whitespace stripped.
    """
    # add_generation_prompt=True makes the rendered prompt end with the
    # assistant-turn header, so the model answers instead of continuing
    # the user's message.
    prompt = t.apply_chat_template(ms, tokenize=False, add_generation_prompt=True)
    # The chat template already emits the special tokens it needs; adding
    # them again while tokenizing would duplicate them (e.g. a double BOS).
    enc = t(prompt, return_tensors="pt", add_special_tokens=False).to("cpu")
    prompt_len = enc["input_ids"].shape[1]
    with torch.no_grad():
        out = m.generate(
            **enc,
            max_new_tokens=max_tokens,
            temperature=temp,
            do_sample=True,
            top_p=0.9,
            top_k=40,
            repetition_penalty=1.05,
            pad_token_id=t.pad_token_id,
        )
    completion_ids = out[0][prompt_len:]
    return t.decode(completion_ids, skip_special_tokens=True).strip()
# Prompt sets adapted to the size of the model.
def _prompt(template, max_tokens, temp):
    """Build a callable mapping a subject to (instruction, max_tokens, temperature)."""
    return lambda s: (template.format(s=s), max_tokens, temp)


if "coder" in MID.lower():
    # Code workers: code-oriented prompts, whatever the model size.
    _prompt_specs = [
        ("Write code example illustrating {s}.", 256, 0.6),
        ("Explain {s} with a code snippet.", 256, 0.7),
        ("How to implement {s}? Step by step.", 320, 0.7),
        ("Best practices for {s} in programming.", 256, 0.7),
        ("Compare approaches for {s} with code.", 320, 0.7),
        ("Common bugs with {s} and how to fix them.", 256, 0.7),
    ]
elif B < 0.5:
    # Ultra-small: very short prompts, short answers.
    _prompt_specs = [
        ("Explique {s} en 1 phrase.", 64, 0.7),
        ("C'est quoi {s} ?", 64, 0.7),
        ("Donne 1 fait sur {s}.", 64, 0.8),
        ("Pourquoi {s} est utile ?", 96, 0.7),
        ("Un conseil sur {s}.", 96, 0.7),
        ("Decris {s} en 2 lignes.", 96, 0.7),
    ]
elif B < 1.0:
    # Small: simple prompts, short answers.
    _prompt_specs = [
        ("Explique {s} simplement.", 128, 0.7),
        ("C'est quoi {s} ? Donne un exemple.", 128, 0.7),
        ("Quels sont les points cles de {s} ?", 160, 0.6),
        ("Pourquoi {s} est important ?", 128, 0.8),
        ("Compare {s} avec son alternative.", 160, 0.7),
        ("Comment utiliser {s} ?", 160, 0.7),
    ]
else:
    # Medium/large: regular prompts, complete answers.
    _prompt_specs = [
        ("Explique {s} en detail avec des exemples concrets.", 256, 0.7),
        ("Analyse {s} : fonctionnement, applications, limites.", 320, 0.7),
        ("Quels sont les concepts cles de {s} ? Liste structuree.", 256, 0.6),
        ("Compare {s} avec son alternative. Avantages/inconvenients.", 320, 0.7),
        ("Tutoriel pas a pas pour utiliser {s}.", 320, 0.7),
        ("L'histoire et l'avenir de {s}.", 320, 0.7),
    ]

# Each entry takes a subject and returns (instruction, max_tokens, temperature).
PROMPTS = [_prompt(*spec) for spec in _prompt_specs]
def _records_for_subject(sj, cycle):
    """Generate up to six training records for one subject."""
    records = []
    for idx in range(6):
        # NOTE: str hashes are salted per process, so this prompt choice is
        # stable within one run but differs between runs.
        pi = hash(TEACHER_ID + sj + str(cycle + idx)) % len(PROMPTS)
        try:
            inst, mt, tmp = PROMPTS[pi](sj)
            resp = gen(
                [
                    {"role": "system", "content": "Tu es Connor."},
                    {"role": "user", "content": inst},
                ],
                max_tokens=mt,
                temp=tmp,
            )
        except Exception as exc:
            # Best effort: one failed generation must not abort the batch,
            # but it is reported instead of being silently discarded.
            print(f"[{TEACHER_ID}] generation failed for {sj!r}: {exc!r}")
            continue
        # Keep only answers longer than 20 characters.
        if resp and len(resp) > 20:
            records.append({"instruction": inst, "input": "", "output": resp,
                            "teacher": TEACHER_ID, "subject": sj, "model": MID})
    return records


def w():
    """Worker loop: load the model, then poll the backend forever.

    Each cycle sends a heartbeat, fetches a batch of entries, generates
    answers for every entry that has a subject, and pushes the collected
    results. The loop sleeps DELAY seconds after a normal cycle and
    30 seconds after any error. It never returns.
    """
    l()
    cycle = 0
    while True:
        try:
            p("/heartbeat", {"teacher": TEACHER_ID, "model": MID})
            r = g("/next-batch?teacher=" + TEACHER_ID + "&batch_size=5")
            if r.status_code == 200:
                es = r.json().get("entries", [])
                if es:
                    rs = []
                    for e in es:
                        sj = e.get("subject", "")
                        if sj:
                            rs.extend(_records_for_subject(sj, cycle))
                    if rs:
                        p("/push-results", {"teacher": TEACHER_ID, "results": rs})
                    cycle += 1
            time.sleep(DELAY)
        except Exception as exc:
            # Report the failure, then back off before the next attempt.
            print(f"[{TEACHER_ID}] worker cycle failed: {exc!r}")
            time.sleep(30)
# Run the worker loop in a daemon thread so it does not keep the process alive.
worker_thread = threading.Thread(target=w, daemon=True)
worker_thread.start()

# Minimal Gradio UI: a zero-input endpoint that reports the worker's identity as JSON text.
demo = gr.Interface(
    fn=lambda: json.dumps({"status": "ok", "id": TEACHER_ID, "model": MID}),
    inputs=[],
    outputs="text",
)
demo.launch()
|