# MarisUK's picture
# Maris AI model sync
# f440f03 verified
"""Data preprocessing."""
from __future__ import annotations
import json
import re
from typing import Any
def clean_text(text: str) -> str:
    """Normalise whitespace in ``text``.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, then leading and trailing whitespace is
    removed.
    """
    return re.sub(r"\s+", " ", text).strip()
def truncate(text: str, max_chars: int = 4096) -> str:
    """Return at most the first ``max_chars`` characters of ``text``.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of characters to keep. A negative value
            keeps nothing. ``None`` is passed straight to the slice and
            therefore means "no limit".

    Returns:
        The leading part of ``text``, never longer than ``max_chars``.
    """
    # A bare ``text[:max_chars]`` with a negative limit would drop characters
    # from the END and return almost the whole string, so negative limits are
    # handled explicitly.
    if max_chars is not None and max_chars < 0:
        return ""
    return text[:max_chars]
def format_conversation(messages: list[dict[str, str]]) -> str:
    """Render a list of chat messages as one training string.

    Each message becomes ``<|role|>``, the content, and ``<|end|>`` on
    separate lines; messages are joined with a newline. A missing ``role``
    defaults to ``"user"`` and a missing ``content`` to the empty string.
    An empty message list yields an empty string.
    """
    rendered = [
        f"<|{turn.get('role', 'user')}|>\n{turn.get('content', '')}\n<|end|>"
        for turn in messages
    ]
    return "\n".join(rendered)
def _preserve_block_text(value: Any) -> str:
    """Convert ``value`` to text, trimming only the outer whitespace.

    Any falsy value (``None``, ``""``, ``0``, empty containers, ...) yields an
    empty string. Interior whitespace, including newlines and indentation, is
    left untouched.
    """
    if not value:
        return ""
    return str(value).strip()
def _append_section(lines: list[str], title: str, value: Any) -> None:
    """Append a ``title:`` section for ``value`` to ``lines`` when it has content.

    How the section body is built depends on the type of ``value``:

    * ``str``  - the text with outer whitespace stripped.
    * ``list`` - one ``- item`` bullet per item whose text is non-empty.
    * non-empty ``dict`` - pretty-printed JSON with sorted keys.

    ``None``, empty dicts, values of any other type, and values that produce
    an empty body leave ``lines`` unchanged.
    """
    if isinstance(value, str):
        body = _preserve_block_text(value)
    elif isinstance(value, list):
        # Normalise each item once, then keep only the non-empty results.
        normalised = (_preserve_block_text(entry) for entry in value)
        body = "\n".join(f"- {entry}" for entry in normalised if entry)
    elif isinstance(value, dict) and value:
        body = json.dumps(value, ensure_ascii=False, indent=2, sort_keys=True)
    else:
        return

    if body:
        lines.append(f"{title}:\n{body}")
def _format_structured_prompt_record(record: dict[str, Any]) -> str:
    """Render a prompt-style record as a chat-formatted training string.

    The user turn starts with the whitespace-normalised ``prompt``, followed
    by one titled section for each optional field that yields content (see
    ``_append_section``), and finally a ``Metadata`` section when ``metadata``
    is truthy. Non-empty parts are separated by blank lines.

    The assistant turn is the stripped ``completion`` when it is non-empty.
    Otherwise, if ``metadata`` is truthy, the metadata serialised as
    single-line JSON is used as the assistant turn. With neither, only the
    user turn is emitted.

    Args:
        record: Mapping holding ``prompt`` and any of the optional fields
            read below.

    Returns:
        The conversation as rendered by ``format_conversation``.
    """
    raw_prompt = record.get("prompt")
    # ``dict.get`` only falls back to its default when the key is absent, so a
    # key stored with a None value would otherwise become the text "None".
    prompt = clean_text("" if raw_prompt is None else str(raw_prompt))
    user_sections = [prompt]

    # (section title, record key) pairs, rendered in this order.
    section_keys = (
        ("Repo konteksts", "repo_context"),
        ("Mērķa fails", "target_file"),
        ("Esošais vai kļūdainais kods", "buggy_code"),
        ("Refactor vai diff konteksts", "diff"),
        ("Papildu konteksts", "context"),
        ("Pieņemšanas kritēriji", "acceptance_criteria"),
        ("Testi", "tests"),
        ("Robežgadījumi", "edge_cases"),
    )
    for title, key in section_keys:
        _append_section(user_sections, title, record.get(key))

    metadata = record.get("metadata")
    if metadata:
        _append_section(user_sections, "Metadata", metadata)

    messages = [
        {"role": "user", "content": "\n\n".join(part for part in user_sections if part)}
    ]

    completion = _preserve_block_text(record.get("completion"))
    if completion:
        messages.append({"role": "assistant", "content": completion})
    elif metadata:
        # No completion available: fall back to the metadata as the target text.
        messages.append(
            {
                "role": "assistant",
                "content": json.dumps(metadata, ensure_ascii=False, sort_keys=True),
            }
        )
    return format_conversation(messages)
def record_to_training_text(record: dict[str, Any], max_chars: int = 4096) -> str:
    """Convert one HF dataset record into text for causal-LM training.

    The first matching layout wins:

    1. ``text`` is a string - it is whitespace-normalised and used as is.
    2. ``user`` and/or ``assistant`` hold a value - a two-turn conversation
       is rendered from them.
    3. ``prompt`` holds a value - the record is rendered by
       ``_format_structured_prompt_record``.
    4. Anything else - the whole record is serialised as JSON with sorted
       keys and whitespace-normalised.

    A key that is present but set to ``None`` is treated the same as a
    missing key, so null fields neither select a layout nor show up as the
    literal text ``"None"``.

    Args:
        record: A single dataset row.
        max_chars: Character limit applied to the result via ``truncate``.

    Returns:
        The training text, truncated to ``max_chars``.
    """
    text = record.get("text")
    if isinstance(text, str):
        return truncate(clean_text(text), max_chars=max_chars)

    user = record.get("user")
    assistant = record.get("assistant")
    if user is not None or assistant is not None:
        messages = [
            {"role": "user", "content": clean_text("" if user is None else str(user))},
            {
                "role": "assistant",
                "content": clean_text("" if assistant is None else str(assistant)),
            },
        ]
        return truncate(format_conversation(messages), max_chars=max_chars)

    if record.get("prompt") is not None:
        return truncate(_format_structured_prompt_record(record), max_chars=max_chars)

    # Unknown layout: fall back to a deterministic JSON dump of the record.
    serialized = json.dumps(record, ensure_ascii=False, sort_keys=True)
    return truncate(clean_text(serialized), max_chars=max_chars)