"""Data preprocessing.

Helpers that turn a single dataset record (a plain ``dict``) into one text
string suitable for causal language-model training.
"""

from __future__ import annotations

import json
import re
from typing import Any

# Compiled once at import time; matches any run of whitespace characters.
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Collapse every whitespace run into a single space and trim both ends."""
    return _WHITESPACE_RE.sub(" ", text).strip()


def truncate(text: str, max_chars: int = 4096) -> str:
    """Return at most the first ``max_chars`` characters of ``text``.

    A negative limit is treated as 0 and yields an empty string; a raw
    negative slice bound would instead silently drop the *tail* of the text.
    """
    return text[: max(max_chars, 0)]


def format_conversation(messages: list[dict[str, str]]) -> str:
    """Render a conversation as a single training string.

    Each message becomes ``<|role|>\\ncontent\\n<|end|>``; messages are joined
    with a newline. A missing ``role`` defaults to ``"user"`` and a missing
    ``content`` to the empty string.
    """
    return "\n".join(
        f"<|{msg.get('role', 'user')}|>\n{msg.get('content', '')}\n<|end|>"
        for msg in messages
    )


def _text_or_empty(value: Any) -> str:
    """Return ``str(value)``, or ``""`` when ``value`` is ``None``.

    ``str(None)`` would otherwise put the literal word "None" into the text.
    """
    return "" if value is None else str(value)


def _preserve_block_text(value: Any) -> str:
    """Stringify ``value`` and strip only the outer whitespace.

    Inner whitespace (newlines, indentation) is kept as is. Any falsy value
    (``None``, ``""``, ``0``, ``False``, empty containers) becomes ``""``.
    """
    return str(value or "").strip()


def _append_section(lines: list[str], title: str, value: Any) -> None:
    """Append a ``title:`` section for ``value`` to ``lines`` (in place).

    * ``str``  -> ``title:\\n<text>`` when the stripped text is non-empty.
    * ``list`` -> ``title:`` followed by one ``- item`` bullet per non-empty
      item.
    * non-empty ``dict`` -> ``title:`` followed by pretty-printed JSON with
      sorted keys.

    ``None``, empty values and values of any other type add nothing.
    """
    if value is None:
        return

    if isinstance(value, str):
        text = _preserve_block_text(value)
        if text:
            lines.append(f"{title}:\n{text}")
        return

    if isinstance(value, list):
        # Normalize each item once, then drop the ones that end up empty.
        items = [text for text in map(_preserve_block_text, value) if text]
        if items:
            lines.append(f"{title}:\n" + "\n".join(f"- {item}" for item in items))
        return

    if isinstance(value, dict) and value:
        rendered = json.dumps(value, ensure_ascii=False, indent=2, sort_keys=True)
        lines.append(f"{title}:\n{rendered}")


def _format_structured_prompt_record(record: dict[str, Any]) -> str:
    """Format a prompt-style record as a user/assistant conversation.

    The user turn is the whitespace-normalized ``prompt`` followed by one
    section per populated context field and, when truthy, the ``metadata``.
    The assistant turn is the ``completion`` if it is non-empty; otherwise
    the compact JSON of ``metadata`` when that is truthy; otherwise there is
    no assistant turn.
    """
    # A None prompt is treated as empty rather than rendered as "None".
    prompt = clean_text(_text_or_empty(record.get("prompt")))
    user_sections = [prompt]

    # Section titles are runtime text emitted verbatim into the output.
    section_fields = (
        ("Repo konteksts", record.get("repo_context")),
        ("Mērķa fails", record.get("target_file")),
        ("Esošais vai kļūdainais kods", record.get("buggy_code")),
        ("Refactor vai diff konteksts", record.get("diff")),
        ("Papildu konteksts", record.get("context")),
        ("Pieņemšanas kritēriji", record.get("acceptance_criteria")),
        ("Testi", record.get("tests")),
        ("Robežgadījumi", record.get("edge_cases")),
    )
    for title, value in section_fields:
        _append_section(user_sections, title, value)

    metadata = record.get("metadata")
    if metadata:
        _append_section(user_sections, "Metadata", metadata)

    # Skip empty sections (e.g. an empty prompt) so no blank block is joined.
    user_content = "\n\n".join(section for section in user_sections if section)
    messages = [{"role": "user", "content": user_content}]

    completion = _preserve_block_text(record.get("completion"))
    if completion:
        messages.append({"role": "assistant", "content": completion})
    elif metadata:
        messages.append(
            {
                "role": "assistant",
                "content": json.dumps(metadata, ensure_ascii=False, sort_keys=True),
            }
        )

    return format_conversation(messages)


def record_to_training_text(record: dict[str, Any], max_chars: int = 4096) -> str:
    """Convert one dataset record into text for causal LM training.

    The first matching layout wins:

    1. ``text`` is a string -> the whitespace-normalized text.
    2. ``user`` and/or ``assistant`` is set -> a two-turn conversation.
    3. ``prompt`` is set -> the structured prompt layout.
    4. otherwise -> the whole record as whitespace-normalized JSON.

    A field whose value is ``None`` counts as not set, so records that carry
    every column with ``None`` placeholders are routed by the data they
    actually contain, and ``None`` is never rendered as the word "None".

    The result is cut to at most ``max_chars`` characters.
    """
    text = record.get("text")
    if isinstance(text, str):
        return truncate(clean_text(text), max_chars=max_chars)

    user = record.get("user")
    assistant = record.get("assistant")
    if user is not None or assistant is not None:
        messages = [
            {"role": "user", "content": clean_text(_text_or_empty(user))},
            {"role": "assistant", "content": clean_text(_text_or_empty(assistant))},
        ]
        return truncate(format_conversation(messages), max_chars=max_chars)

    if record.get("prompt") is not None:
        return truncate(_format_structured_prompt_record(record), max_chars=max_chars)

    serialized = json.dumps(record, ensure_ascii=False, sort_keys=True)
    return truncate(clean_text(serialized), max_chars=max_chars)