# NOTE: the original paste began with "Spaces: / Paused / Paused" — Hugging Face
# Spaces page-header residue, not code. Replaced with this comment so the file parses.
import gradio as gr
import json
import os
import torch
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from trl import SFTTrainer
from datasets import Dataset

# --- CONFIGURATION ---
MODEL_ID = "Maincode/Maincoder-1B"      # base model id on the Hugging Face Hub
OUTPUT_DIR = "mandre_qlora_adapter"     # directory where the trained LoRA adapter is saved
JSON_FILE_NAME = "train_data.json"      # default dataset file produced by the generator tab

# Global chat state, lazily initialized by load_chat_model()
chat_model = None
chat_tokenizer = None
# ==========================================
# PART 1: DATASET GENERATOR
# ==========================================
def generate_json_dataset(files):
    """Build an instruction-tuning JSON dataset from uploaded files.

    Each readable file becomes one ``{"text": ...}`` entry in Alpaca-style
    "### Instruction / ### Response" format and the result is written to
    JSON_FILE_NAME in the working directory.

    Args:
        files: list of gradio file objects (or plain path strings); may be
            None/empty if the user uploaded nothing.

    Returns:
        Tuple ``(abs_path_or_None, status_message)`` matching the two
        gradio output components.
    """
    if not files:
        return None, "❌ Ошибка: Вы не загрузили файлы."
    data_entries = []
    for file_item in files:
        # gradio may hand us either a bare path string or a tempfile wrapper
        if isinstance(file_item, str):
            file_path = file_item
        elif hasattr(file_item, 'name'):
            file_path = file_item.name
        else:
            continue
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            # unreadable / binary files are skipped, not fatal
            print(f"Skipping file {file_path}: {e}")
            continue
        filename = os.path.basename(file_path)
        # FIX: interpolate the actual filename — the original had the literal
        # string '(unknown)' here and never used the `filename` variable.
        instruction = f"Analyze the code/text in file '{filename}' from the MandreLib project."
        text = f"### Instruction:\n{instruction}\n\n### Response:\n{content}<|endoftext|>"
        data_entries.append({"text": text})
    if not data_entries:
        return None, "❌ Не удалось прочитать ни один текстовый файл."
    try:
        with open(JSON_FILE_NAME, 'w', encoding='utf-8') as f:
            json.dump(data_entries, f, indent=4, ensure_ascii=False)
        abs_path = os.path.abspath(JSON_FILE_NAME)
        return abs_path, f"✅ Готово! Обработано файлов: {len(data_entries)}. Файл {JSON_FILE_NAME} создан."
    except Exception as e:
        return None, f"❌ Ошибка записи JSON: {e}"
# ==========================================
# PART 2: TRAINING (FIXED)
# ==========================================
def train_mandre_ai(file_obj, epochs, lr):
    """Fine-tune MODEL_ID with LoRA on a JSON dataset, streaming progress to the UI.

    Generator function: each ``yield`` is a human-readable (Russian) status
    line shown in the gradio log textbox. On success the trained adapter and
    tokenizer are saved into OUTPUT_DIR.

    Args:
        file_obj: uploaded dataset file (gradio File or path string), or None
            to reuse JSON_FILE_NAME from a previous generation step.
        epochs: number of training epochs (coerced to float).
        lr: learning rate (coerced to float).
    """
    if file_obj is None:
        # Fall back to the dataset produced earlier by the generator tab.
        if os.path.exists(JSON_FILE_NAME):
            json_path = JSON_FILE_NAME
            yield f"⚠️ Файл не передан, используем {JSON_FILE_NAME} из прошлой генерации."
        else:
            yield "❌ Ошибка: Нет файла с данными!"
            return
    else:
        json_path = file_obj.name if hasattr(file_obj, 'name') else file_obj
    yield f"🚀 Старт обучения {MODEL_ID}..."
    try:
        # 1. Load the training data (list of {"text": ...} dicts)
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        dataset = Dataset.from_pandas(pd.DataFrame(data))
        yield f"📊 Данные: {len(dataset)} строк. Загрузка токенизатора..."
        # 2. Tokenizer (FIX: use_fast=False to avoid the Rust tokenizer error)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
        # Model ships without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token
        # 3. LoRA configuration (rank-16 adapters on the attention projections)
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj']
        )
        # 4. Training arguments — CPU-only run, fp32.
        # NOTE(review): `no_cuda` is a deprecated alias of `use_cpu` in recent
        # transformers releases; passing both may warn or conflict depending on
        # the installed version — verify against the pinned transformers.
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=float(epochs),
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,  # effective batch size 8
            learning_rate=float(lr),
            weight_decay=0.01,
            use_cpu=True,
            no_cuda=True,
            fp16=False,
            logging_steps=1,
            save_total_limit=1,
            push_to_hub=False,
            report_to="none"
        )
        yield "📥 Загрузка модели (Maincoder-1B)..."
        # 5. Load the base model
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
        model.gradient_checkpointing_enable()   # trade compute for memory
        model.enable_input_require_grads()      # needed for checkpointing + PEFT adapters
        yield "🧠 Инициализация тренера..."
        # 6. Trainer
        # NOTE(review): passing dataset_text_field / tokenizer / max_seq_length
        # directly to SFTTrainer matches the older trl API; newer trl moved
        # these into SFTConfig — confirm the pinned trl version.
        trainer = SFTTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            dataset_text_field="text",
            peft_config=peft_config,
            tokenizer=tokenizer,
            max_seq_length=1024
        )
        yield "🔥 ОБУЧЕНИЕ ЗАПУЩЕНО! Ждите завершения..."
        trainer.train()
        yield "💾 Сохранение..."
        # Save only the adapter weights plus the tokenizer files.
        trainer.model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        yield f"✅ УСПЕХ! Адаптер в папке '{OUTPUT_DIR}'. Можно чатиться."
    except Exception as e:
        import traceback
        # Surface the full traceback in the UI log instead of crashing gradio.
        yield f"❌ ОШИБКА:\n{traceback.format_exc()}"
# ==========================================
# PART 3: CHAT
# ==========================================
def load_chat_model():
    """Lazily initialize the global chat model and tokenizer.

    Prefers the fine-tuned LoRA adapter in OUTPUT_DIR when one exists,
    otherwise falls back to the plain base model. Returns a status string
    for display in the UI.
    """
    global chat_model, chat_tokenizer
    # Already initialized — nothing to do.
    if chat_model is not None:
        return "Уже загружено"
    try:
        # Slow (pure-Python) tokenizer here as well, mirroring training.
        chat_tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID, use_fast=False, trust_remote_code=True
        )
        adapter_cfg = os.path.join(OUTPUT_DIR, "adapter_config.json")
        if not os.path.exists(adapter_cfg):
            # No adapter trained yet: serve the untuned base model.
            chat_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
            return "⚠️ Адаптер не найден. Работает чистая модель."
        base = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
        chat_model = PeftModel.from_pretrained(base, OUTPUT_DIR)
        return f"✅ Адаптер QLoRA загружен!"
    except Exception as e:
        return f"Ошибка: {e}"
def generate_answer(prompt, history):
    """Chat callback: answer `prompt` and return the updated chat history.

    Loads the model lazily on first use. FIX: this function is wired as the
    output of a gr.Chatbot component, which expects a list of
    (user, bot) message pairs — the original returned a bare string, which
    the Chatbot cannot render. We now append to `history` and return it.

    Args:
        prompt: user's question from the textbox.
        history: current gr.Chatbot history (list of pairs) or None.

    Returns:
        Updated history list of (user, bot) pairs.
    """
    history = history or []
    if chat_model is None:  # idiomatic None check (was truthiness test)
        status = load_chat_model()
        if "Ошибка" in status:
            # Surface the load error as a chat reply instead of breaking the UI.
            history.append((prompt, status))
            return history
    formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = chat_tokenizer(formatted_prompt, return_tensors="pt")
    outputs = chat_model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.6,
        top_p=0.95
    )
    response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the last "### Response:" marker (the decoded
    # output echoes the prompt).
    if "### Response:" in response:
        response = response.split("### Response:")[-1].strip()
    history.append((prompt, response))
    return history
# ==========================================
# UI
# ==========================================
with gr.Blocks(title="MandreAI Fix") as app:
    gr.Markdown("# 🦎 MandreAI 1B (CPU Fix)")
    with gr.Tabs():
        # Tab 1: turn raw source files into a training JSON dataset.
        with gr.Tab("1. Датасет"):
            src_files = gr.File(file_count="multiple", label="Исходные файлы")
            make_btn = gr.Button("Создать JSON", variant="primary")
            dataset_file = gr.File(label="Готовый датасет")
            gen_status = gr.Textbox(label="Статус")
            make_btn.click(
                generate_json_dataset,
                inputs=[src_files],
                outputs=[dataset_file, gen_status],
            )
        # Tab 2: run LoRA fine-tuning; log lines stream into the textbox.
        with gr.Tab("2. Обучение"):
            with gr.Row():
                dataset_in = gr.File(label="train_data.json")
                epoch_count = gr.Number(value=3, label="Эпохи", precision=0)
                learn_rate = gr.Number(value=2e-4, label="LR")
            train_btn = gr.Button("ЗАПУСТИТЬ ОБУЧЕНИЕ", variant="stop")
            train_log = gr.Textbox(label="Лог", lines=10)
            train_btn.click(
                train_mandre_ai,
                inputs=[dataset_in, epoch_count, learn_rate],
                outputs=[train_log],
            )
        # Tab 3: chat with the (optionally fine-tuned) model.
        with gr.Tab("3. Чат"):
            chat_box = gr.Chatbot(label="MandreAI")
            user_msg = gr.Textbox(label="Вопрос")
            send_btn = gr.Button("Отправить")
            send_btn.click(generate_answer, [user_msg, chat_box], chat_box)

if __name__ == "__main__":
    app.queue().launch(allowed_paths=["."])