import os
import csv
import json
import shutil
from typing import Optional, List, Any

from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE: We deliberately do NOT import TOOLS here; the tools list is passed
# into create_conversation_format() at call time to avoid stale data.
from tools import DEFAULT_SYSTEM_MSG


def authenticate_hf(token: Optional[str]) -> None:
    """Logs into the Hugging Face Hub.

    Args:
        token: A Hugging Face access token. When falsy, login is skipped
            with a console message instead of raising.
    """
    if token:
        print("Logging into Hugging Face Hub...")
        login(token=token)
    else:
        print("Skipping Hugging Face login: HF_TOKEN not set.")


def load_model_and_tokenizer(model_name: str):
    """Loads a causal-LM model and its tokenizer, with a Hub fallback.

    Args:
        model_name: Local directory path or Hub model id.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: Re-raises whatever ``from_pretrained`` raised, after
            logging which target model failed.
    """
    print(f"Loading Transformer model: {model_name}")
    # Bind outside the try so the except clause can always reference it.
    target_model = model_name
    try:
        # NOTE(review): only ".."-prefixed relative paths trigger the
        # fallback; a missing "./model" or absolute path is still handed
        # straight to from_pretrained — confirm this narrowing is intended.
        if model_name.startswith("..") and not os.path.exists(model_name):
            print(f"Warning: Local path {model_name} not found. Falling back to default hub model.")
            target_model = "google/gemma-2b-it"
        tokenizer = AutoTokenizer.from_pretrained(target_model)
        model = AutoModelForCausalLM.from_pretrained(target_model)
        print("Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading Transformer model {target_model}: {e}")
        # Bare raise preserves the original traceback (fix: was `raise e`,
        # which re-anchors the traceback at this line).
        raise


def create_conversation_format(sample, tools_list):
    """Formats a dataset row into the conversational format required for SFT.

    Args:
        sample: Mapping with keys "user_content", "tool_name", and
            "tool_arguments" (a JSON string; malformed or non-string
            arguments degrade to an empty dict rather than failing the row).
        tools_list: Tool schemas to inject into the sample. Passed in as an
            argument (not imported at module load) so the data is never stale.

    Returns:
        A dict with "messages" (developer/user/assistant turns, the
        assistant turn carrying a single function tool call) and "tools".
    """
    try:
        tool_args = json.loads(sample["tool_arguments"])
    except (json.JSONDecodeError, TypeError):
        # TypeError covers None / non-string tool_arguments values.
        tool_args = {}
    return {
        "messages": [
            {"role": "developer", "content": DEFAULT_SYSTEM_MSG},
            {"role": "user", "content": sample["user_content"]},
            {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": sample["tool_name"], "arguments": tool_args}}]},
        ],
        "tools": tools_list,  # Injects the dynamic tools
    }


def parse_csv_dataset(file_path: str) -> List[List[str]]:
    """Parses an uploaded CSV file into rows of three stripped strings.

    The first row is skipped as a header only when its first cell mentions
    "user_content"; otherwise the file is rewound so that row is parsed as
    data. Rows with fewer than 3 columns are dropped; columns beyond the
    first 3 are ignored.

    Args:
        file_path: Path to the CSV file; falsy values yield an empty list.

    Returns:
        A list of ``[user_content, tool_name, tool_arguments]`` rows.
    """
    dataset: List[List[str]] = []
    if not file_path:
        return dataset
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            return dataset  # empty file
        if not (header and "user_content" in header[0].lower()):
            # First row is data, not a header: rewind and build a fresh
            # reader (fix: re-using the old reader after seek relied on the
            # text wrapper's read-ahead buffer being reset).
            f.seek(0)
            reader = csv.reader(f)
        for row in reader:
            if len(row) >= 3:
                dataset.append([s.strip() for s in row[:3]])
    return dataset


def zip_directory(source_dir: str, output_name_base: str) -> str:
    """Zips a directory.

    Args:
        source_dir: Directory whose contents are archived.
        output_name_base: Output path WITHOUT the ".zip" extension;
            ``make_archive`` appends it.

    Returns:
        The full path of the created zip file.
    """
    return shutil.make_archive(
        base_name=output_name_base,
        format='zip',
        root_dir=source_dir,
    )