Andro0s committed on
Commit 2636834 · verified · 1 Parent(s): 4f41a14

Update app.py

Files changed (1):
  1. app.py +23 -60
app.py CHANGED
@@ -1,69 +1,32 @@
  import os
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding
- from datasets import load_dataset
-
- # ==============================
- # Model configuration
- # ==============================
- MODEL_NAME = "bigcode/starcoder"
- OUTPUT_DIR = "./results"
-
- # Load tokenizer and model
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
- # Fix the padding token
- if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token  # use EOS as padding
- # If you prefer to add a new PAD token instead:
- # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-
- # If you added a new token, resize the embeddings
- # model.resize_token_embeddings(len(tokenizer))
-
- # ==============================
- # Prepare the dataset
- # ==============================
- # Example with wikitext (replace with your own dataset)
- dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")  # small sample
-
- def tokenize_function(examples):
-     return tokenizer(examples["text"], truncation=True)
-
- tokenized_dataset = dataset.map(tokenize_function, batched=True)
-
- # ==============================
- # DataCollator configuration
- # ==============================
- data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
-
- # ==============================
- # Trainer configuration
- # ==============================
- training_args = TrainingArguments(
-     output_dir=OUTPUT_DIR,
-     evaluation_strategy="steps",
-     per_device_train_batch_size=2,
-     per_device_eval_batch_size=2,
-     num_train_epochs=1,
-     save_steps=10,
-     save_total_limit=2,
-     logging_steps=5,
-     report_to="none",
- )
-
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_dataset,
-     eval_dataset=tokenized_dataset,
-     tokenizer=tokenizer,
-     data_collator=data_collator,
- )
-
- # ==============================
- # Start training
- # ==============================
- trainer.train()
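Aside on the removed training path: DataCollatorWithPadding only pads input_ids and attention_mask, so batches would have reached Trainer without a labels field and no causal-LM loss could be computed. A minimal sketch of the usual fix, assuming the same tokenizer as above, swaps in DataCollatorForLanguageModeling with mlm=False, which pads each batch and mirrors input_ids into labels:

    from transformers import DataCollatorForLanguageModeling

    # mlm=False selects plain causal language modeling: the collator pads the
    # batch and copies input_ids into labels, setting padded positions to -100
    # so they are ignored by the loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)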
 
  import os
+ from huggingface_hub import login
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+
+ # Model name (public and unrestricted)
+ MODEL_NAME = "bigcode/santacoder"
+
+ # Get the token from the environment (set in Settings → Secrets)
+ hf_token = os.environ.get("HF_TOKEN")
+
+ # Log in securely (without printing the token)
+ if hf_token:
+     login(token=hf_token)
+ else:
+     print("⚠️ Token not found. Add 'HF_TOKEN' in Settings → Secrets.")
+
+ # Load the model and the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+ # Create the pipeline
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+ # Simple generation helper
+ def generate_text(prompt):
+     output = generator(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_p=0.95)  # do_sample=True so temperature/top_p take effect
+     return output[0]["generated_text"]
+
+ # Quick test
+ if __name__ == "__main__":
+     texto = "AmorCoderAI es una IA creada para"
+     print(generate_text(texto))
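For a quick smoke test of the updated app, a sketch assuming HF_TOKEN is already available in the environment (the secret name follows the app's own code; the placeholder value and the code-style prompt below are illustrative):

    import os

    # In a Space this comes from Settings → Secrets; the value here is a placeholder.
    os.environ.setdefault("HF_TOKEN", "hf_xxx")

    # Importing app runs the login and loads bigcode/santacoder once.
    from app import generate_text

    # santacoder is a code model, so a code-shaped prompt plays to its strengths.
    print(generate_text("def fibonacci(n):"))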