Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer | |
| import csv | |
| import yaml | |
| from datasets import Dataset | |
import os

# TensorFlow is used here only for the GPU diagnostic print below; the
# training itself runs on PyTorch. Treat it as optional so that a missing
# TensorFlow install does not abort the whole script at import time.
try:
    import tensorflow as tf
except ImportError:
    tf = None

# Check TensorFlow GPU availability (skipped when TensorFlow is absent)
if tf is not None:
    print("GPUs Available: ", tf.config.list_physical_devices('GPU'))

# Tune the PyTorch MPS allocator's high-watermark ratio.
# NOTE(review): per the PyTorch MPS docs '0.0' disables the upper memory
# limit; confirm that is intended, and that this is set early enough to be
# seen by the allocator (torch is imported above this line).
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
def load_data_and_config(data_path, delimiter=';', text_column='description'):
    """Load training texts from a delimited CSV file.

    Args:
        data_path: Path to a UTF-8 encoded CSV file with a header row.
        delimiter: Field separator used in the file. Defaults to ``';'``.
        text_column: Header name of the column holding the training text.
            Defaults to ``'description'``.

    Returns:
        A list of ``{'text': <cell value>}`` dicts in file order, one per row
        whose text cell is present and not blank. A completely empty file
        (no header row) yields ``[]``.

    Raises:
        KeyError: If the header row does not contain ``text_column``.
    """
    # 'utf-8-sig' transparently drops a leading BOM (common in spreadsheet
    # exports), which would otherwise become part of the first header name.
    with open(data_path, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter)
        header = reader.fieldnames
        if header is None:
            return []
        if text_column not in header:
            raise KeyError(
                f"column {text_column!r} not found in {data_path!r}; "
                f"available columns: {header}"
            )
        # Short rows give None for missing cells; skip those and blank cells
        # so downstream tokenization never receives None or empty text.
        return [
            {'text': row[text_column]}
            for row in reader
            if (row[text_column] or '').strip()
        ]
def generate_api_query(model, tokenizer, prompt, desired_output, api_name, base_url):
    """Generate an API query URL using a fine-tuned causal language model.

    The instruction ``"<prompt> Write an API query to <api_name> to get
    <desired_output>"`` is fed to the model, and only the newly generated
    continuation is appended to ``base_url``.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
            Assumed to be a decoder-only model whose ``generate`` output
            starts with the prompt tokens - TODO confirm for the checkpoint
            in use.
        tokenizer: Object exposing ``encode(..., return_tensors="pt")`` and
            ``decode(...)``.
        prompt: Free-text description of the task.
        desired_output: What the query should retrieve.
        api_name: Name of the target API, inserted into the instruction.
        base_url: Base URL the generated query is appended to.

    Returns:
        ``"<base_url>/<generated text>"`` with exactly one slash at the join.
    """
    instruction = f"{prompt} Write an API query to {api_name} to get {desired_output}"
    # Keep the inputs on the same device as the model.
    input_ids = tokenizer.encode(instruction, return_tensors="pt").to(model.device)
    prompt_length = input_ids.shape[-1]

    output = model.generate(
        input_ids,
        # Single unpadded sequence: every position is a real token.
        attention_mask=torch.ones_like(input_ids),
        # Budget applies to the continuation only, so long prompts do not
        # eat into (or exhaust) the generation length.
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,  # sampling is required for temperature to apply
    )

    # generate() returns prompt + continuation; drop the prompt so the
    # instruction text does not end up inside the returned URL.
    completion_ids = output[0][prompt_length:]
    query = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
    return f"{base_url.rstrip('/')}/{query.lstrip('/')}"
| from transformers import TrainingArguments, Trainer | |
def train_model(model, tokenizer, data):
    """Fine-tune ``model`` on ``data`` with the Hugging Face ``Trainer`` API.

    Each example is tokenized to a fixed length of 512 tokens and the model
    is trained to reproduce its own input (labels are the input ids), with
    padding positions excluded from the loss. After training, the model is
    saved via ``trainer.save_model()`` into the ``./results`` output
    directory.

    Args:
        model: The model to fine-tune; passed to ``Trainer`` as-is.
        tokenizer: Tokenizer matching ``model``. It must be able to pad
            (``padding='max_length'`` is used).
        data: Iterable of dicts, each with a ``'text'`` entry.

    Raises:
        ValueError: If ``data`` contains no examples.
    """
    texts = [example['text'] for example in data]
    if not texts:
        raise ValueError("train_model() needs at least one training example")

    # One batched tokenizer call returning plain Python lists, which is what
    # Dataset.from_dict expects (no per-row tensors to squeeze).
    encoded = tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )

    # -100 is the index ignored by the loss in Transformers models, so the
    # padding added above does not contribute to training.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]

    dataset = Dataset.from_dict({
        'input_ids': encoded['input_ids'],
        # Without the mask the model would attend to padding positions.
        'attention_mask': encoded['attention_mask'],
        'labels': labels,
    })

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # NOTE(review): newer transformers releases rename the `tokenizer`
    # argument to `processing_class`; confirm against the pinned version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    # The Trainer handles the training loop internally
    trainer.train()

    # Release cached accelerator memory. is_available() (rather than
    # is_built()) ensures an MPS device can actually be used at runtime.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()

    # Persist the trained model to training_args.output_dir
    trainer.save_model()
def main(api_name, base_url, data_path="train2.csv",
         model_name="thenlper/gte-small", output_dir="./fine_tuned_model"):
    """Fine-tune a model on CSV data, save it, and print one sample query.

    Args:
        api_name: Name of the target API, used in the generation prompt.
        base_url: Base URL the generated query is appended to.
        data_path: CSV file with the training texts. Defaults to
            ``"train2.csv"``.
        model_name: Checkpoint name or path passed to ``from_pretrained``
            for both tokenizer and model. Defaults to
            ``"thenlper/gte-small"``.
        output_dir: Directory the fine-tuned model and tokenizer are saved
            to. Defaults to ``"./fine_tuned_model"``.

    Raises:
        ValueError: If ``data_path`` yields no training examples.
    """
    # Load data first so an empty/misconfigured dataset fails before the
    # (potentially slow) checkpoint download.
    data = load_data_and_config(data_path)
    if not data:
        raise ValueError(f"no training examples found in {data_path!r}")

    # Load tokenizer and model
    # NOTE(review): the default checkpoint appears to be a text-embedding
    # model rather than a generative one - confirm it is suitable for
    # AutoModelForCausalLM before relying on the generated output.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Train the model on your dataset
    train_model(model, tokenizer, data)

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled while generating.
    model.eval()

    # Example usage
    prompt = "I need to retrieve the latest block on chain using a python script"
    api_query = generate_api_query(
        model, tokenizer, prompt, "latest block on chain", api_name, base_url
    )
    print(f"Generated code: {api_query}")
if __name__ == "__main__":
    # Script entry point: run the full pipeline against the Koios REST API.
    api_name, base_url = "Koios", "https://api.koios.rest"
    main(api_name, base_url)