Spaces:
Sleeping
Sleeping
import json
import os
import random
import re

# Point the Hugging Face cache at a fixed directory inside the container.
# This has to run BEFORE `datasets` is imported: the Hugging Face libraries
# resolve their cache locations from HF_HOME at import time, so assigning the
# variable after the import leaves the default cache path in use.
os.environ["HF_HOME"] = "/app/cache"

from datasets import load_dataset  # noqa: E402  (must follow the HF_HOME assignment)
from fastapi import FastAPI  # noqa: E402

app = FastAPI()

# Load the training split, keeping only the columns the API needs.
# `sentence_type` is retained solely so the filter below can use it.
dataset = load_dataset(
    "UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30",
    split="train",
).select_columns(["compound", "sentence", "output", "sentence_type"])

# Keep only the records whose sentence uses the compound idiomatically.
dataset = dataset.filter(lambda record: record["sentence_type"] == "idiomatic")
# Upper bound on the number of idioms returned by a single call.
_SAMPLE_SIZE = 50

# Matches the opening and closing <output> tags that wrap the JSON payload.
_OUTPUT_TAG_RE = re.compile(r"</?output>")


def _extract_compound_meaning(raw_output):
    """Return the 'Compound Meaning' field of a tagged JSON payload.

    Returns an empty string when the payload is not a string, is not valid
    JSON, or does not decode to a JSON object.
    """
    if not isinstance(raw_output, str):
        return ""
    cleaned = _OUTPUT_TAG_RE.sub("", raw_output).strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        return ""
    # Valid JSON is not necessarily an object (it may be a list, string,
    # number...), and only objects support the key lookup below.
    if not isinstance(parsed, dict):
        return ""
    return parsed.get("Compound Meaning", "")


# NOTE(review): no route decorator (e.g. ``@app.get(...)``) is visible above
# this function in the reviewed source - confirm the endpoint is registered.
async def get_idioms():
    """Return a random selection of idioms with an example and a definition.

    Up to ``_SAMPLE_SIZE`` distinct records are drawn from the module-level
    ``dataset``; when fewer records are available, all of them are returned
    in random order instead of raising ``ValueError``.

    Returns:
        A list of dicts with the keys ``"idiom"``, ``"example"`` and
        ``"definition"``. ``"definition"`` is an empty string when the
        record's output cannot be parsed.
    """
    total = len(dataset)
    # Sample row indices rather than rows so the whole dataset is not copied
    # into a list on every call.
    chosen = random.sample(range(total), min(_SAMPLE_SIZE, total))

    response = []
    for index in chosen:
        item = dataset[index]
        response.append({
            "idiom": item.get("compound", ""),
            "example": item.get("sentence", ""),
            "definition": _extract_compound_meaning(item.get("output", "")),
        })
    return response