| """Datu priekšapstrāde.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| import re |
| from typing import Any |
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize the whitespace of ``text``.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space, then leading and trailing whitespace is
    removed. Line structure is therefore not preserved.

    Args:
        text: The raw text to normalize.

    Returns:
        The single-line, whitespace-normalized text.
    """
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
def truncate(text: str, max_chars: int = 4096) -> str:
    """Return at most the first ``max_chars`` characters of ``text``.

    Truncation is purely character-based: no attempt is made to cut on a
    word or marker boundary.

    Args:
        text: The text to shorten.
        max_chars: Maximum number of characters to keep. Zero or a negative
            value yields an empty string.

    Returns:
        ``text`` unchanged when it already fits, otherwise its prefix of
        length ``max_chars``.
    """
    # A negative bound would otherwise be interpreted by slicing as "drop the
    # last N characters", which is not a length limit at all.
    if max_chars <= 0:
        return ""
    return text[:max_chars]
|
|
|
|
def format_conversation(messages: list[dict[str, str]]) -> str:
    """Render a list of chat messages as one training string.

    Each message becomes ``<|role|>``, a newline, the content, a newline and
    ``<|end|>``; the rendered messages are joined with single newlines.

    Args:
        messages: Message dicts. A missing ``"role"`` defaults to ``"user"``
            and a missing ``"content"`` defaults to the empty string.

    Returns:
        The concatenated conversation text, or ``""`` for an empty list.
    """
    return "\n".join(
        f"<|{turn.get('role', 'user')}|>\n{turn.get('content', '')}\n<|end|>"
        for turn in messages
    )
|
|
|
|
def _preserve_block_text(value: Any) -> str:
    """Convert ``value`` to text while keeping its inner line structure.

    Only the outer whitespace is stripped; newlines and indentation inside
    the text are left untouched. Any falsy value (``None``, ``""``, ``0``,
    an empty container, ...) yields the empty string.
    """
    if not value:
        return ""
    return str(value).strip()
|
|
|
|
def _append_section(lines: list[str], title: str, value: Any) -> None:
    """Append a ``title:`` section rendered from ``value`` to ``lines``.

    Rendering depends on the type of ``value``:

    * ``str``  -- the outer-stripped text goes below the title.
    * ``list`` -- every non-empty item becomes a ``- item`` bullet.
    * ``dict`` -- a non-empty dict is pretty-printed as key-sorted JSON.

    ``None``, values that render to nothing, and values of any other type
    are ignored, leaving ``lines`` unchanged.

    Args:
        lines: Output list of sections; mutated in place.
        title: Section heading, emitted as ``"<title>:"``.
        value: Section payload.
    """
    if value is None:
        return
    if isinstance(value, str):
        text = _preserve_block_text(value)
        if text:
            lines.append(f"{title}:\n{text}")
        return
    if isinstance(value, list):
        # Normalize every item exactly once, then drop the ones that end up
        # empty (previously each item was converted twice).
        items = [text for text in map(_preserve_block_text, value) if text]
        if items:
            bullets = "\n".join(f"- {item}" for item in items)
            lines.append(f"{title}:\n{bullets}")
        return
    if isinstance(value, dict) and value:
        rendered = json.dumps(value, ensure_ascii=False, indent=2, sort_keys=True)
        lines.append(f"{title}:\n{rendered}")
|
|
|
|
def _format_structured_prompt_record(record: dict[str, Any]) -> str:
    """Render a structured prompt record as a chat-formatted training string.

    The user turn is the whitespace-normalized ``prompt`` followed by one
    titled section per optional context field that is present
    (``repo_context``, ``target_file``, ``buggy_code``, ``diff``,
    ``context``, ``acceptance_criteria``, ``tests``, ``edge_cases``) and,
    when truthy, ``metadata``. Sections are separated by blank lines.

    The assistant turn is ``completion`` when it is non-empty; otherwise the
    compact JSON form of ``metadata`` when that is truthy; otherwise there is
    no assistant turn at all.

    Args:
        record: A dataset row using the structured-prompt schema.

    Returns:
        The conversation text produced by ``format_conversation``.
    """
    raw_prompt = record.get("prompt")
    # A present-but-null prompt must not be rendered as the literal "None".
    prompt = clean_text("" if raw_prompt is None else str(raw_prompt))
    user_sections = [prompt]
    section_fields = (
        ("Repo konteksts", record.get("repo_context")),
        ("Mērķa fails", record.get("target_file")),
        ("Esošais vai kļūdainais kods", record.get("buggy_code")),
        ("Refactor vai diff konteksts", record.get("diff")),
        ("Papildu konteksts", record.get("context")),
        ("Pieņemšanas kritēriji", record.get("acceptance_criteria")),
        ("Testi", record.get("tests")),
        ("Robežgadījumi", record.get("edge_cases")),
    )
    for title, value in section_fields:
        _append_section(user_sections, title, value)
    metadata = record.get("metadata")
    if metadata:
        _append_section(user_sections, "Metadata", metadata)

    # Empty sections (e.g. an empty prompt) are dropped before joining.
    user_content = "\n\n".join(section for section in user_sections if section)
    messages = [{"role": "user", "content": user_content}]

    completion = _preserve_block_text(record.get("completion"))
    if completion:
        messages.append({"role": "assistant", "content": completion})
    elif metadata:
        # No completion available: fall back to the metadata as the target.
        messages.append(
            {
                "role": "assistant",
                "content": json.dumps(metadata, ensure_ascii=False, sort_keys=True),
            }
        )
    return format_conversation(messages)
|
|
|
|
def record_to_training_text(record: dict[str, Any], max_chars: int = 4096) -> str:
    """Convert one dataset record into text for causal-LM training.

    The first matching schema wins:

    1. ``text`` is a string -- whitespace-normalized plain text.
    2. ``user`` and/or ``assistant`` is set -- a two-turn conversation.
    3. ``prompt`` is set -- a structured prompt record.
    4. Anything else -- the key-sorted JSON dump of the whole record.

    A field counts as "set" only when its value is not ``None``. Rows that
    carry every column with ``None`` for the unused ones are therefore routed
    by the data they actually contain, and a null value is never rendered as
    the literal text ``"None"``.

    Args:
        record: A single dataset row.
        max_chars: Character limit applied to the final text. The cut is
            character-based and may fall inside a trailing marker.

    Returns:
        The training text, at most ``max_chars`` characters long.
    """
    text = record.get("text")
    if isinstance(text, str):
        return truncate(clean_text(text), max_chars=max_chars)

    user = record.get("user")
    assistant = record.get("assistant")
    if user is not None or assistant is not None:
        messages = [
            {"role": "user", "content": clean_text("" if user is None else str(user))},
            {
                "role": "assistant",
                "content": clean_text("" if assistant is None else str(assistant)),
            },
        ]
        return truncate(format_conversation(messages), max_chars=max_chars)

    if record.get("prompt") is not None:
        return truncate(_format_structured_prompt_record(record), max_chars=max_chars)

    # Unknown schema: keep the data by serializing the whole record.
    serialized = json.dumps(record, ensure_ascii=False, sort_keys=True)
    return truncate(clean_text(serialized), max_chars=max_chars)
|
|