# File size: 1,416 Bytes
# c3a047c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
import pandas as pd
import json

# Folder where uploaded datasets are stored. The path is relative, so it is
# resolved against the process's current working directory.
DATA_DIR = "./training/data"
# Create the folder as soon as the module is imported; exist_ok=True makes
# this a no-op when the folder is already there.
os.makedirs(DATA_DIR, exist_ok=True)

def save_uploaded_file(file, filename):
    """Save an uploaded file into the data folder and return its path.

    Args:
        file: Binary file-like object; its ``read()`` result is written
            verbatim to disk.
        filename: Name to store the file under. Only the final path
            component is used, so directory parts are discarded.

    Returns:
        The path of the written file inside ``DATA_DIR``.

    Raises:
        ValueError: If ``filename`` has no usable final component
            (empty, ``.`` or ``..``).
    """
    # The name comes from the uploader, so never trust directory parts:
    # "../x" or "/etc/x" must not be able to escape DATA_DIR. Backslashes are
    # normalized first so Windows-style names are also reduced on POSIX.
    safe_name = os.path.basename(str(filename).replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError("Invalid filename")

    path = os.path.join(DATA_DIR, safe_name)
    with open(path, "wb") as f:
        f.write(file.read())
    return path

def convert_to_jsonl(file_path, src_col="src", tgt_col="tgt"):
    """Normalize a tabular dataset file (csv, xlsx, tsv) to JSONL.

    Each output line is a JSON object ``{"src": ..., "tgt": ...}`` (source
    text and its translation). A ``.jsonl`` input is returned untouched.

    Args:
        file_path: Path to the input file; the extension selects the reader.
        src_col: Header of the source-text column.
        tgt_col: Header of the target-text column.

    Column selection: when both ``src_col`` and ``tgt_col`` are present in
    the header they are used; otherwise the first two columns are taken as
    source and target, in that order.

    Rows in which either side is empty are skipped instead of being written
    as the literal text "nan".

    Returns:
        Path of the JSONL file, written next to the input with the same base
        name (or ``file_path`` itself when it is already ``.jsonl``).

    Raises:
        ValueError: If the extension is unsupported or the dataset has fewer
            than two columns.
    """
    root, ext = os.path.splitext(file_path)
    ext = ext.lower()

    if ext == ".jsonl":
        return file_path  # already JSONL

    # Read every cell as text so pandas does not reinterpret sentences:
    # without this "007" becomes 7 and cells such as "NA" or "null" turn into
    # NaN. Only truly empty cells are treated as missing.
    read_opts = {"dtype": str, "keep_default_na": False, "na_values": [""]}
    if ext == ".csv":
        data = pd.read_csv(file_path, **read_opts)
    elif ext == ".xlsx":
        data = pd.read_excel(file_path, **read_opts)
    elif ext == ".tsv":
        data = pd.read_csv(file_path, sep="\t", **read_opts)
    else:
        raise ValueError("Unsupported file format")

    if len(data.columns) < 2:
        raise ValueError("Dataset must have at least two columns")

    # Pick columns by position rather than renaming: renaming the first two
    # columns could collide with an existing "src"/"tgt" header and produce
    # duplicate labels.
    columns = list(data.columns)
    if src_col in columns and tgt_col in columns:
        src_idx, tgt_idx = columns.index(src_col), columns.index(tgt_col)
    else:
        src_idx, tgt_idx = 0, 1

    src_values = data.iloc[:, src_idx]
    tgt_values = data.iloc[:, tgt_idx]
    complete = src_values.notna() & tgt_values.notna()

    jsonl_path = root + ".jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for src, tgt in zip(src_values[complete], tgt_values[complete]):
            record = {"src": str(src), "tgt": str(tgt)}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    return jsonl_path