"""Writing Style AI — dataset builder, Colab fine-tuning guide and knowledge base.

Backends: OpenAI, HuggingFace Inference API, Groq (OpenAI-compatible) and a
local Ollama server.  All provider calls return a display string and never
raise — errors are formatted for direct display in the Gradio UI.
"""

import gradio as gr
import json
import os
import re
import csv
import requests
from datetime import datetime
from pathlib import Path

try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

DATASET_DIR = "datasets"
KB_DIR = "knowledge_base"
os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(KB_DIR, exist_ok=True)


# ─── AI Providers ───────────────────────────────────────────────────────────

def call_openai(prompt, model="gpt-3.5-turbo", api_key=""):
    """Single-turn chat completion via OpenAI.

    Returns the reply text, or a user-facing warning/error string on failure
    (never raises — the result is rendered directly in the UI).
    """
    if not api_key:
        return "⚠️ Введите API ключ OpenAI"
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=2000,
        )
        return resp.choices[0].message.content
    except Exception as e:
        return f"❌ OpenAI error: {e}"


def call_hf_inference(prompt, model="mistralai/Mistral-7B-Instruct-v0.3", api_key=""):
    """Text generation via the HuggingFace Inference API.

    The API key is optional (anonymous requests are rate-limited).  Handles
    the two response shapes the API returns (list of generations or a dict,
    possibly carrying an "error" field).
    """
    url = f"https://api-inference.huggingface.co/models/{model}"
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    payload = {"inputs": prompt, "parameters": {"max_new_tokens": 800, "temperature": 0.7}}
    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=60)
        data = resp.json()
        if isinstance(data, list) and data:
            return data[0].get("generated_text", str(data))
        if isinstance(data, dict):
            if "error" in data:
                return f"⚠️ HF: {data['error']}"
            return data.get("generated_text", str(data))
        return str(data)
    except Exception as e:
        return f"❌ HF error: {e}"


def call_ollama(prompt, model="llama3.2", host="http://localhost:11434"):
    """Non-streaming generation against a local Ollama server."""
    try:
        resp = requests.post(
            f"{host}/api/generate",
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=120,
        )
        return resp.json().get("response", "No response")
    except Exception as e:
        return f"❌ Ollama error: {e}"


def call_groq(prompt, model="llama-3.1-8b-instant", api_key=""):
    """Chat completion via Groq's OpenAI-compatible endpoint."""
    if not api_key:
        return "⚠️ Введите API ключ Groq"
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key, base_url="https://api.groq.com/openai/v1")
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=2000,
        )
        return resp.choices[0].message.content
    except Exception as e:
        return f"❌ Groq error: {e}"


def call_ai(prompt, provider, model, api_key, ollama_host):
    """Dispatch *prompt* to the provider selected in the UI dropdown.

    Provider labels must match the dropdown choices exactly (emoji included).
    """
    if provider == "🤗 HuggingFace (бесплатно)":
        return call_hf_inference(prompt, model, api_key)
    elif provider == "⚡ Groq (бесплатно)":
        return call_groq(prompt, model, api_key)
    elif provider == "🏠 Ollama (локально)":
        return call_ollama(prompt, model, ollama_host)
    elif provider == "🔵 OpenAI":
        return call_openai(prompt, model, api_key)
    return "⚠️ Выберите провайдера"


# ─── Dataset functions ───────────────────────────────────────────────────────

def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a .docx joined by blank lines.

    Returns "" when python-docx is not installed.
    """
    if not DOCX_AVAILABLE:
        return ""
    doc = Document(file_path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n\n".join(paragraphs)


def split_into_chunks(text, chunk_size=512, overlap=64):
    """Split *text* into word-based chunks.

    Each chunk holds up to ``chunk_size`` words, with ``overlap`` words shared
    between consecutive chunks.

    Fix: the original advanced the cursor by ``chunk_size - overlap``, which
    is zero or negative whenever ``overlap >= chunk_size`` and looped forever;
    the step is now clamped to at least 1.
    """
    words = text.split()
    chunks = []
    step = max(1, chunk_size - overlap)  # guard against non-positive step
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i: i + chunk_size]))
        i += step
    return chunks


def text_to_qa_pairs(text, source_name=""):
    """Derive instruction-tuning records from paragraphs of *text*.

    Every paragraph longer than 60 chars yields a "continue the text" record
    (with a 200-char prefix as input when the paragraph is long enough), and
    every paragraph after the first additionally yields a "write on topic"
    record keyed by its first 80 characters.
    """
    paragraphs = [p.strip() for p in text.split("\n\n") if len(p.strip()) > 60]
    pairs = []
    for i, para in enumerate(paragraphs):
        pairs.append({
            "instruction": "Продолжи текст в стиле автора.",
            "input": para[:200] if len(para) > 200 else "",
            "output": para,
            "source": source_name,
        })
        if i > 0:
            pairs.append({
                "instruction": "Напиши текст на тему: " + para[:80],
                "input": "",
                "output": para,
                "source": source_name,
            })
    return pairs


# In-memory buffer of dataset records accumulated across process_files() calls
# until save_dataset() flushes them to disk.
dataset_store = []


def process_files(files, chunk_size, overlap, format_choice, add_qa):
    """Convert uploaded .docx/.txt files into dataset records.

    Appends the new records to the module-level ``dataset_store`` buffer and
    returns (log text, gr.update enabling the save button, preview string).
    ``format_choice`` is accepted for interface parity with the UI wiring but
    only matters at save time.
    """
    global dataset_store
    if not files:
        return "⚠️ Файлы не выбраны.", gr.update(interactive=False), ""
    new_records = []
    log_lines = []
    for file_obj in files:
        path = file_obj  # gradio supplies plain file paths here
        name = os.path.basename(path)
        ext = Path(path).suffix.lower()
        if ext == ".docx":
            if not DOCX_AVAILABLE:
                log_lines.append(f"❌ {name}: python-docx не установлен")
                continue
            text = extract_text_from_docx(path)
        elif ext == ".txt":
            with open(path, encoding="utf-8", errors="ignore") as f:
                text = f.read()
        else:
            log_lines.append(f"⏭ {name}: неподдерживаемый формат")
            continue
        if not text.strip():
            log_lines.append(f"⚠️ {name}: пустой файл")
            continue
        chunks = split_into_chunks(text, int(chunk_size), int(overlap))
        for chunk in chunks:
            new_records.append({
                "instruction": "Напиши в стиле автора.",
                "input": "",
                "output": chunk,
                "source": name,
            })
        if add_qa:
            qa_pairs = text_to_qa_pairs(text, name)
            new_records.extend(qa_pairs)
        log_lines.append(f"✅ {name}: {len(chunks)} чанков")
    dataset_store.extend(new_records)
    preview = "\n".join(
        f"[{i+1}] {r['output'][:120]}..."
        for i, r in enumerate(dataset_store[:5])
    )
    return (
        "\n".join(log_lines) + f"\n\n📦 Всего записей: {len(dataset_store)}",
        gr.update(interactive=True),
        preview,
    )


def _write_jsonl(out_path, records):
    # One JSON object per line, UTF-8, non-ASCII kept readable.
    with open(out_path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


def save_dataset(format_choice, dataset_name):
    """Flush ``dataset_store`` to disk in the chosen format and clear it.

    Returns (status message, saved file path or None).  Unknown formats fall
    back to JSONL — the original duplicated the JSONL writer verbatim in both
    the "JSONL" and the fallback branch; they now share ``_write_jsonl``.
    """
    global dataset_store
    if not dataset_store:
        return "⚠️ Буфер пустой.", None
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Sanitize the user-supplied name so it is always a safe filename.
    safe_name = re.sub(r"[^a-zA-Z0-9_\-]", "_", dataset_name or "dataset")
    fname_base = f"{safe_name}_{ts}"
    if format_choice == "JSON":
        out_path = os.path.join(DATASET_DIR, fname_base + ".json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(dataset_store, f, ensure_ascii=False, indent=2)
    elif format_choice == "CSV":
        out_path = os.path.join(DATASET_DIR, fname_base + ".csv")
        with open(out_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["instruction", "input", "output", "source"])
            writer.writeheader()
            writer.writerows(dataset_store)
    else:
        # "JSONL" and any unrecognized choice both produce JSONL.
        out_path = os.path.join(DATASET_DIR, fname_base + ".jsonl")
        _write_jsonl(out_path, dataset_store)
    count = len(dataset_store)
    dataset_store = []
    return f"✅ Сохранено: {out_path} ({count} записей)", out_path


def list_datasets():
    """Human-readable listing of saved dataset files with sizes in KB."""
    files = list(Path(DATASET_DIR).glob("*.*"))
    if not files:
        return "📭 Нет датасетов."
    lines = [f"• {f.name} ({f.stat().st_size // 1024} KB)" for f in sorted(files)]
    return "\n".join(lines)


def clear_buffer():
    """Drop all records accumulated in the in-memory dataset buffer."""
    global dataset_store
    dataset_store = []
    return "🗑️ Буфер очищен."


# Markdown guide shown in the UI; MODEL_PLACEHOLDER / DATASET_PLACEHOLDER are
# substituted by get_colab_guide().
COLAB_TEMPLATE = """## 🚀 Fine-tuning (LoRA) — Бесплатно через Google Colab

### 1. Установка
```python
!pip install -q unsloth transformers peft datasets trl accelerate bitsandbytes
```

### 2. Загрузка модели
```python
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="MODEL_PLACEHOLDER",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)
```

### 3. Загрузка датасета
```python
from datasets import load_dataset

dataset = load_dataset("json", data_files="DATASET_PLACEHOLDER", split="train")

def format_prompt(ex):
    return {"text": f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"}

dataset = dataset.map(format_prompt)
```

### 4. Обучение
```python
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        output_dir="outputs",
        optim="adamw_8bit",
    ),
)
trainer.train()
```

### 5. Сохранение
```python
model.save_pretrained_merged("my_style_model", tokenizer, save_method="merged_16bit")
```

---
**Бесплатные GPU:** Google Colab T4 | Kaggle (30ч/нед) | HuggingFace Spaces"""


def get_colab_guide(model_choice, dataset_path):
    """Fill the Colab template with the chosen base model and dataset path.

    Unknown model labels fall back to the Mistral 7B 4-bit checkpoint.
    """
    model_map = {
        "Mistral 7B (рекомендуется)": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
        "Llama 3.1 8B": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "Gemma 2 9B": "unsloth/gemma-2-9b-it-bnb-4bit",
        "Phi-3 Mini (быстрее)": "unsloth/Phi-3-mini-4k-instruct",
    }
    guide = COLAB_TEMPLATE.replace(
        "MODEL_PLACEHOLDER",
        model_map.get(model_choice, "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"),
    )
    guide = guide.replace("DATASET_PLACEHOLDER", dataset_path or "your_dataset.jsonl")
    return guide


# ─── Knowledge Base ──────────────────────────────────────────────────────────

# name -> {"text": ..., "tags": [...], "created": iso-timestamp}
kb_store = {}
KB_FILE = os.path.join(KB_DIR, "knowledge_base.json")


def load_kb():
    """(Re)load the knowledge base from disk into ``kb_store``.

    Fix: this runs at import time, so a corrupt or unreadable KB file used to
    crash the whole app on startup; such failures now leave the in-memory
    store untouched instead of raising.
    """
    global kb_store
    if os.path.exists(KB_FILE):
        try:
            with open(KB_FILE, encoding="utf-8") as f:
                kb_store = json.load(f)
        except (OSError, json.JSONDecodeError):
            pass


def save_kb_to_disk():
    """Persist ``kb_store`` to KB_FILE as pretty-printed UTF-8 JSON."""
    with open(KB_FILE, "w", encoding="utf-8") as f:
        json.dump(kb_store, f, ensure_ascii=False, indent=2)


def get_kb_choices():
    """Entry names for populating UI dropdowns/checkbox groups."""
    return list(kb_store.keys())


def add_kb_entry(name, text, tags_raw, files):
    """Create or overwrite a KB entry from free text plus optional files.

    ``tags_raw`` is a comma-separated string; .docx/.txt file contents are
    appended to the text.  Returns (status message, choices update).
    """
    if not name.strip():
        return "⚠️ Введите название.", gr.update(choices=get_kb_choices())
    tags = [t.strip() for t in tags_raw.split(",") if t.strip()]
    content = text.strip()
    if files:
        for fpath in files:
            ext = Path(fpath).suffix.lower()
            if ext == ".docx" and DOCX_AVAILABLE:
                content += "\n\n" + extract_text_from_docx(fpath)
            elif ext == ".txt":
                with open(fpath, encoding="utf-8", errors="ignore") as f:
                    content += "\n\n" + f.read()
    kb_store[name.strip()] = {
        "text": content,
        "tags": tags,
        "created": datetime.now().isoformat(),
    }
    save_kb_to_disk()
    choices = get_kb_choices()
    return f"✅ '{name}' добавлена в базу.", gr.update(choices=choices)


def refresh_kb():
    """Reload from disk and return an update with the current entry names."""
    load_kb()
    return gr.update(choices=get_kb_choices())


def get_kb_entry_content(name):
    """Return (text, comma-joined tags, created timestamp) for *name*.

    Returns three empty strings when the name is empty or unknown.
    """
    if not name:
        return "", "", ""
    load_kb()
    entry = kb_store.get(name, {})
    if not entry:
        return "", "", ""
    return entry.get("text", ""), ", ".join(entry.get("tags", [])), entry.get("created", "")


def delete_kb_entry(name):
    """Remove *name* from the KB (and disk); returns (status, choices update)."""
    if not name:
        return "⚠️ Выберите запись.", gr.update(choices=get_kb_choices())
    load_kb()
    if name in kb_store:
        del kb_store[name]
        save_kb_to_disk()
        return f"🗑️ '{name}' удалена.", gr.update(choices=get_kb_choices())
    return "⚠️ Не найдено.", gr.update(choices=get_kb_choices())


def search_kb(query, selected_entries):
    """Case-insensitive substring search over entry text, name and tags.

    Searches only ``selected_entries`` when given, otherwise the whole KB.
    Returns markdown snippets separated by horizontal rules.
    """
    load_kb()
    results = []
    search_in = selected_entries if selected_entries else list(kb_store.keys())
    q = query.lower()
    for name in search_in:
        entry = kb_store.get(name, {})
        text = entry.get("text", "")
        tags = " ".join(entry.get("tags", []))
        if q in text.lower() or q in name.lower() or q in tags.lower():
            snippet = text[:300].replace("\n", " ")
            results.append(f"**{name}** [{', '.join(entry.get('tags', []))}]\n{snippet}...")
    return "\n\n---\n\n".join(results) if results else "🔍 Ничего не найдено."


def compose_context_for_writing(selected_entries, writing_task):
    """Build a style-imitation prompt from selected KB entries.

    Each entry contributes at most its first 800 characters of text.  Returns
    a warning string (prefixed "⚠️") when nothing usable is selected.
    """
    load_kb()
    context_parts = []
    for name in (selected_entries or []):
        entry = kb_store.get(name, {})
        if entry:
            context_parts.append(f"=== {name} ===\n{entry['text'][:800]}")
    context = "\n\n".join(context_parts)
    if not context:
        return "⚠️ Выберите записи из Knowledge Base."
    return f"""Ты пишешь текст в точном стиле автора, используя следующие знания:

{context}

Задание: {writing_task}

[Начало текста в стиле автора]:"""


# Populate kb_store once at import so the UI starts with existing entries.
load_kb()


# ─── macOS CSS ───────────────────────────────────────────────────────────────

macos_css = """
/* === macOS System Font === */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
* {
    font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', 'SF Pro Text', 'Inter', 'Helvetica Neue', Arial, sans-serif !important;
}

/* === Base === */
body, .gradio-container {
    background: #f2f2f7 !important;
    color: #1c1c1e !important;
}
footer { display: none !important; }

/* === Window chrome === */
.gradio-container > .main { background: #f2f2f7 !important; }

/* === Panels === */
.panel, .block, .form {
    background: rgba(255,255,255,0.85) !important;
    border: 1px solid rgba(0,0,0,0.08) !important;
    border-radius: 12px !important;
    box-shadow: 0 1px 3px rgba(0,0,0,0.08), 0 4px 16px rgba(0,0,0,0.04) !important;
    backdrop-filter: blur(20px) !important;
}

/* === Tab bar === */
.tabs > .tab-nav {
    background: rgba(255,255,255,0.7) !important;
    border-radius: 10px !important;
    padding: 4px !important;
    border: 1px solid rgba(0,0,0,0.08) !important;
    backdrop-filter: blur(20px) !important;
}
.tabs > .tab-nav button {
    border-radius: 7px !important;
    font-size: 13px !important;
    font-weight: 500 !important;
    color: #8e8e93 !important;
    padding: 6px 14px !important;
    transition: all 0.15s ease !important;
    background: transparent !important;
    border: none !important;
}
.tabs > .tab-nav button.selected {
    background: #ffffff !important;
    color: #1c1c1e !important;
    font-weight: 600 !important;
    box-shadow: 0 1px 4px rgba(0,0,0,0.15) !important;
}

/* === Buttons === */
button.primary, .btn-primary, button[variant="primary"] {
    background: #007aff !important;
    color: #ffffff !important;
    border: none !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    font-size: 13px !important;
    padding: 8px 18px !important;
    transition: all 0.15s ease !important;
    box-shadow: 0 1px 3px rgba(0,122,255,0.3) !important;
}
button.primary:hover, button[variant="primary"]:hover {
    background: #0071eb !important;
    transform: translateY(-0.5px) !important;
    box-shadow: 0 2px 8px rgba(0,122,255,0.4) !important;
}
button.stop, button[variant="stop"] {
    background: #ff3b30 !important;
    color: #fff !important;
    border: none !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    font-size: 13px !important;
}
button.secondary, button[variant="secondary"] {
    background: rgba(0,0,0,0.05) !important;
    color: #1c1c1e !important;
    border: 1px solid rgba(0,0,0,0.12) !important;
    border-radius: 8px !important;
    font-weight: 500 !important;
    font-size: 13px !important;
}

/* === Inputs === */
input[type="text"], textarea, .input-text, select {
    background: rgba(255,255,255,0.9) !important;
    border: 1px solid rgba(0,0,0,0.15) !important;
    border-radius: 8px !important;
    color: #1c1c1e !important;
    font-size: 13px !important;
    padding: 8px 10px !important;
    transition: border 0.15s ease, box-shadow 0.15s ease !important;
}
input[type="text"]:focus, textarea:focus {
    border-color: #007aff !important;
    box-shadow: 0 0 0 3px rgba(0,122,255,0.15) !important;
    outline: none !important;
}

/* === Labels === */
label, .label-wrap span, .block > label > span {
    font-size: 12px !important;
    font-weight: 600 !important;
    color: #6e6e73 !important;
    letter-spacing: 0.3px !important;
    text-transform: uppercase !important;
}

/* === Headings === */
h1 { font-size: 28px !important; font-weight: 700 !important; letter-spacing: -0.5px !important; color: #1c1c1e !important; }
h2 { font-size: 20px !important; font-weight: 600 !important; color: #1c1c1e !important; }
h3 { font-size: 15px !important; font-weight: 600 !important; color: #1c1c1e !important; }

/* === Sliders === */
input[type="range"] { accent-color: #007aff !important; }

/* === Checkboxes === */
input[type="checkbox"] { accent-color: #34c759 !important; width: 16px !important; height: 16px !important; }

/* === Radio === */
input[type="radio"] { accent-color: #007aff !important; }

/* === File upload === */
.upload-container, [data-testid="file-upload"] {
    background: rgba(0,122,255,0.04) !important;
    border: 1.5px dashed rgba(0,122,255,0.3) !important;
    border-radius: 12px !important;
    transition: all 0.2s ease !important;
}
.upload-container:hover {
    background: rgba(0,122,255,0.08) !important;
    border-color: #007aff !important;
}

/* === Sidebar / columns === */
.gap { gap: 12px !important; }

/* === macOS window title bar decoration === */
.app-header { display: flex; align-items: center; gap: 8px; padding: 0 0 16px 4px; }
.traffic-lights { display: flex; gap: 6px; align-items: center; }
.tl { width:12px; height:12px; border-radius:50%; display:inline-block; }
.tl-red { background:#ff5f57; border:1px solid #e0443e; }
.tl-yellow { background:#febc2e; border:1px solid #d4a000; }
.tl-green { background:#28c840; border:1px solid #1aab29; }

/* === Status boxes === */
.textbox textarea {
    font-family: 'SF Mono', 'Menlo', 'Monaco', monospace !important;
    font-size: 12px !important;
    line-height: 1.5 !important;
}

/* === Markdown output === */
.prose, .markdown-text {
    font-size: 14px !important;
    line-height: 1.6 !important;
    color: #1c1c1e !important;
}

/* === AI Chat bubbles === */
.message-wrap .message {
    border-radius: 16px !important;
    font-size: 14px !important;
}

/* === Scrollbar === */
::-webkit-scrollbar { width: 6px; height: 6px; }
::-webkit-scrollbar-track { background: transparent; }
::-webkit-scrollbar-thumb { background: rgba(0,0,0,0.2); border-radius: 3px; }
::-webkit-scrollbar-thumb:hover { background: rgba(0,0,0,0.35); }

/* === AI Chat area === */
.chatbot { border-radius: 12px !important; }

/* === Provider selector === */
.provider-pill {
    display: inline-flex;
    align-items: center;
    padding: 4px 10px;
    border-radius: 20px;
    font-size: 12px;
    font-weight: 600;
    margin: 2px;
}
"""
─────────────────────────────────────────────────────────────── PROVIDER_MODELS = { "🤗 HuggingFace (бесплатно)": [ "mistralai/Mistral-7B-Instruct-v0.3", "meta-llama/Meta-Llama-3-8B-Instruct", "HuggingFaceH4/zephyr-7b-beta", "google/gemma-7b-it", "tiiuae/falcon-7b-instruct", ], "⚡ Groq (бесплатно)": [ "llama-3.1-8b-instant", "llama-3.3-70b-versatile", "mixtral-8x7b-32768", "gemma2-9b-it", ], "🏠 Ollama (локально)": [ "llama3.2", "llama3.1", "mistral", "phi3", "gemma2", "qwen2.5", ], "🔵 OpenAI": [ "gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo", ], } def update_models(provider): models = PROVIDER_MODELS.get(provider, []) return gr.update(choices=models, value=models[0] if models else "") # AI chat history chat_history = [] def ai_chat(message, history, provider, model, api_key, ollama_host, system_prompt): if not message.strip(): return history, "" full_prompt = message if system_prompt.strip(): full_prompt = f"{system_prompt}\n\nПользователь: {message}" # Add context from history if history: ctx = "\n".join([f"User: {h[0]}\nAssistant: {h[1]}" for h in history[-3:]]) full_prompt = ctx + "\n\nUser: " + message if system_prompt.strip(): full_prompt = system_prompt + "\n\n" + full_prompt response = call_ai(full_prompt, provider, model, api_key, ollama_host) history = history + [[message, response]] return history, "" def ai_generate_text(prompt, provider, model, api_key, ollama_host): if not prompt.strip(): return "⚠️ Введите запрос" return call_ai(prompt, provider, model, api_key, ollama_host) def ai_generate_with_kb(selected_entries, writing_task, provider, model, api_key, ollama_host): prompt = compose_context_for_writing(selected_entries, writing_task) if prompt.startswith("⚠️"): return prompt return call_ai(prompt, provider, model, api_key, ollama_host) def clear_chat(): return [], "" with gr.Blocks(title="Writing Style AI", css=macos_css, theme=gr.themes.Default( font=gr.themes.GoogleFont("Inter"), primary_hue="blue", secondary_hue="gray", neutral_hue="gray", )) as demo: 
# macOS window title gr.HTML("""