File size: 1,417 Bytes
f232d53
7ec4963
d7ab29f
67a207f
d7ab29f
 
67a207f
f232d53
 
 
7ec4963
67a207f
d7ab29f
 
770e3a9
d7ab29f
 
67a207f
7ec4963
 
d7ab29f
67a207f
20c79ed
67a207f
d7ab29f
 
 
 
 
 
 
 
 
 
 
67a207f
d7ab29f
 
 
67a207f
20c79ed
39cbc9d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import os
import random
import re

# HF_HOME has to be in the environment *before* the Hugging Face libraries are
# imported: they resolve their cache locations from the environment at import
# time, so assigning it after `from datasets import ...` does not change where
# load_dataset() reads and writes its cache.
os.environ["HF_HOME"] = "/app/cache"

from datasets import load_dataset  # noqa: E402  (must follow the HF_HOME assignment)
from fastapi import FastAPI  # noqa: E402

app = FastAPI()

# Load the train split once at import time and keep only the columns the API
# needs; sentence_type is retained because the filter below reads it.
dataset = load_dataset("UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30", split="train").select_columns(["compound", "sentence", "output", "sentence_type"])

# Keep only the records whose sentence_type is exactly "idiomatic".
dataset = dataset.filter(lambda x: x["sentence_type"] == "idiomatic")

def _extract_compound_meaning(raw_output):
    """Return the 'Compound Meaning' value embedded in a record's ``output``.

    The surrounding ``<output>``/``</output>`` tags are stripped and the
    remainder is parsed as JSON. An empty string is returned when the value
    is not a string, is not valid JSON, or does not decode to a JSON object.
    """
    if not isinstance(raw_output, str):
        # A null/missing column value would make re.sub raise TypeError.
        return ""

    # Strip <output> tags using regex
    cleaned_output = re.sub(r"</?output>", "", raw_output).strip()

    try:
        output_json = json.loads(cleaned_output)
    except json.JSONDecodeError:
        return ""  # Fallback if JSON is invalid

    if not isinstance(output_json, dict):
        # Valid JSON that is a list/string/number has no .get(); treat it
        # the same way as invalid JSON instead of failing the request.
        return ""

    return output_json.get("Compound Meaning", "")


@app.get("/api/idioms")
async def get_idioms():
    """Return up to 50 randomly chosen idiom records.

    Each element of the returned list is a dict with the keys ``idiom``
    (the record's ``compound``), ``example`` (its ``sentence``) and
    ``definition`` (the 'Compound Meaning' parsed from its ``output``, or an
    empty string when that cannot be extracted). Records are drawn without
    replacement; if the dataset holds fewer than 50 rows, all of them are
    returned in random order.
    """
    total = len(dataset)
    # Sample row indices rather than list(dataset): this avoids materialising
    # every row on each request, and capping the sample size keeps
    # random.sample from raising ValueError on a small dataset.
    chosen_indices = random.sample(range(total), min(50, total))

    response = []
    for index in chosen_indices:
        item = dataset[index]
        response.append({
            "idiom": item.get("compound", ""),
            "example": item.get("sentence", ""),
            "definition": _extract_compound_meaning(item.get("output", "")),
        })

    return response