Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import json | |
| import random | |
| import pandas as pd | |
| import xml.etree.ElementTree as ET | |
| import gradio as gr | |
| from datetime import datetime | |
| from huggingface_hub import InferenceClient, HfApi, hf_hub_download | |
# --- SETTINGS & AUTH ---
# NOTE(review): the HF token is read from an env var literally named "DATASET"
# (presumably a Space secret of that name) — confirm on the Space settings.
HF_TOKEN = os.getenv("DATASET")
DATASET_REPO = "st192011/PANINI-SLA"  # HF dataset repo that stores the learner corpus
CSV_FILENAME = "learner_corpus_v5_morphology.csv"  # corpus file inside that repo
# --- EXPANDED MODEL LIST ---
# Categorized for the user to understand trade-offs
# Maps UI display label -> Hub model id used by InferenceClient.
LLM_MODELS = {
    # --- The Speedsters (Instant generation, slightly less strict) ---
    "Qwen 2.5 7B (Alibaba) - Recommended": "Qwen/Qwen2.5-7B-Instruct",
    "Llama 3.2 3B (Meta)": "meta-llama/Llama-3.2-3B-Instruct",
    "Zephyr 7B (HuggingFace H4)": "HuggingFaceH4/zephyr-7b-beta",
    # --- The Powerhouses (Smartest, Slower) ---
    "Llama 3.3 70B (Meta)": "meta-llama/Llama-3.3-70B-Instruct",
    "Qwen 2.5 72B (Alibaba)": "Qwen/Qwen2.5-72B-Instruct",
    "Mixtral 8x7B (Mistral AI)": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # --- The Efficient Teachers (Fast, Good Logic) ---
    "Mistral Nemo 12B (Mistral AI)": "mistralai/Mistral-Nemo-Instruct-2407",
    "Gemma 2 9B (Google)": "google/gemma-2-9b-it",
    "Phi-4 (Microsoft)": "microsoft/Phi-4",
}
# Hub client used by log_to_dataset() to upload the corpus CSV.
api = HfApi(token=HF_TOKEN)
# --- 1. PROMPT DESIGN (Teacher) ---
def get_system_prompt(language, level, topic, grammar):
    """Build the instruction prompt sent to the LLM.

    Requests a short passage in `language` at CEFR `level` about `topic`
    that exercises `grammar`, followed by a BNC-style XML annotation in
    the exact shape that parse_xml_to_sentences() consumes.
    """
    return f"""You are an expert Applied Linguist and Computational Linguist.
TASK: Create a cohesive reading passage (8-12 sentences) in {language} ({level}) about '{topic}'.
GRAMMAR FOCUS: {grammar} (The text must naturally require this grammar).
OUTPUT FORMAT:
1. First, provide the story in plain text.
2. Second, provide the story in BNC-style XML.
XML RULES:
- Use <root> and <s> sentences.
- Words: <w hw="LEMMA" pos="MAJOR_POS" c5="SPECIFIC_TAG">word</w>
- CRITICAL: Use correct C5 tags for inflection:
- Nouns: NN1 (Singular), NN2 (Plural)
- Adjectives: AJ0 (Base), AJC (Comparative), AJS (Superlative)
- Verbs: VVI (Inf), VVD (Past), VVG (Gerund), VVN (Past Part), VVZ (3rd Pers)
- Prepositions: PRP
Example:
<root>
<s>
<w hw="be" pos="VERB" c5="VBD">was</w>
<w hw="big" pos="ADJ" c5="AJC">bigger</w>
<w hw="dog" pos="NOUN" c5="NN2">dogs</w>
</s>
</root>
"""
# --- 2. PARSING (Linguist) ---
def parse_xml_to_sentences(xml_string):
    """Extract token data from model-produced BNC-style XML.

    Returns a list of sentences; each sentence is a list of dicts:
    {"type": "word", "surface", "hw", "pos", "c5"} for <w> elements and
    {"type": "punct", "surface"} for <c> elements. Returns [] when no
    usable XML is found or parsing fails.
    """
    # Models frequently wrap their XML in markdown fences; drop those first.
    cleaned = re.sub(r'```xml|```', '', xml_string).strip()
    parsed = []
    try:
        # Locate the XML payload even when surrounded by model chatter.
        found = re.search(r'<root>.*</root>|<s>.*</s>', cleaned, re.DOTALL)
        if found:
            payload = found.group()
            # Bare <s>...</s> output still needs a single root element.
            if not payload.startswith('<root>'):
                payload = f"<root>{payload}</root>"
            tree = ET.fromstring(payload)
            for sent in tree.findall('s'):
                tokens = []
                for node in sent:
                    text = node.text.strip() if node.text else ""
                    if node.tag == 'w':
                        tokens.append({
                            "type": "word",
                            "surface": text,
                            "hw": node.get("hw", ""),
                            "pos": node.get("pos", ""),
                            "c5": node.get("c5", ""),
                        })
                    elif node.tag == 'c':
                        tokens.append({"type": "punct", "surface": text})
                # Skip sentences that carried no recognized tokens.
                if tokens:
                    parsed.append(tokens)
    except Exception as e:
        print(f"XML Parse Error: {e}")
    return parsed
# --- 3. ADVANCED CHALLENGE LOGIC (The Proctor) ---
def get_challenge(sentence_items, grammar_focus):
    """Pick one quizzable token from a parsed sentence.

    Collects morphologically interesting words (plural nouns, graded
    adjectives, inflected verbs, prepositions), then prefers the category
    that matches the teacher's grammar focus, falling back to any
    candidate. Returns a challenge dict or None when nothing qualifies.
    """
    def _candidate(kind, idx, item, prompt):
        # Uniform challenge record shape used by the quiz UI and logger.
        return {
            "type": kind, "index": idx, "target": item['surface'],
            "prompt": prompt,
            "hw": item['hw'], "pos": item['pos'], "c5": item['c5'],
        }

    pool = []
    for idx, item in enumerate(sentence_items):
        if item['type'] != 'word':
            continue
        lemma, pos, c5 = item['hw'], item['pos'], item['c5']
        inflected = item['surface'].lower() != lemma.lower()
        # --- LOGIC RULES ---
        if pos == 'NOUN' and c5 == 'NN2':
            pool.append(_candidate("Noun Pluralization", idx, item, f"Plural form of '{lemma}'"))
        elif pos == 'ADJ' and c5 == 'AJC':
            pool.append(_candidate("Comparative Adjective", idx, item, f"Comparative form of '{lemma}'"))
        elif pos == 'ADJ' and c5 == 'AJS':
            pool.append(_candidate("Superlative Adjective", idx, item, f"Superlative form of '{lemma}'"))
        elif pos == 'VERB' and inflected:
            pool.append(_candidate("Verb Conjugation", idx, item, f"Correct form of verb '{lemma}'"))
        elif pos in ('PREP', 'PRP', 'ADP'):
            pool.append(_candidate("Preposition", idx, item, "Correct Preposition"))

    # --- FILTERING BASED ON USER FOCUS ---
    # Each (keywords, marker) row is tried in order; a row only wins when
    # the focus mentions one of its keywords AND a matching candidate exists.
    focus = grammar_focus.lower()
    preferences = (
        (("plural", "noun"), "Noun"),
        (("comparative", "superlative", "adjective"), "Adjective"),
        (("verb", "tense"), "Verb"),
        (("prep",), "Preposition"),
    )
    for keywords, marker in preferences:
        if any(k in focus for k in keywords):
            matches = [c for c in pool if marker in c['type']]
            if matches:
                return random.choice(matches)
    # Fallback: any candidate at all.
    return random.choice(pool) if pool else None
# --- 4. PERSISTENCE ---
def log_to_dataset(state, last_attempt):
    """Append one quiz attempt to the learner-corpus CSV on the HF Hub.

    Best-effort: any download/upload failure is printed and swallowed so
    a persistence hiccup never breaks the quiz flow. No-op when no token
    is configured.
    """
    if not HF_TOKEN:
        return  # logging disabled without credentials
    new_entry = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model": state.get('model'),
        "user": state.get('user'),
        "l1": state.get('l1'),
        "l2": state.get('l2'),
        "level": state.get('level'),
        "topic": state.get('topic'),
        "grammar_focus": state.get('grammar'),
        "quiz_type": last_attempt.get('type'),
        "target_word": last_attempt.get('target'),
        "student_input": last_attempt.get('input'),
        "is_correct": last_attempt.get('correct'),
        "lemma": last_attempt.get('hw'),
        "pos": last_attempt.get('pos'),
        "c5_tag": last_attempt.get('c5')
    }
    try:
        try:
            path = hf_hub_download(repo_id=DATASET_REPO, filename=CSV_FILENAME, repo_type="dataset", token=HF_TOKEN)
            df = pd.read_csv(path)
        except Exception:
            # FIX: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt. First run (or download failure) starts fresh.
            df = pd.DataFrame(columns=list(new_entry.keys()))
        df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
        api.upload_file(
            path_or_fileobj=df.to_csv(index=False).encode("utf-8"),
            path_in_repo=CSV_FILENAME,
            repo_id=DATASET_REPO,
            repo_type="dataset"
        )
    except Exception as e:
        print(f"Logging Error: {e}")
# --- 5. GRADIO FLOW ---
def generate_lesson_logic(model_key, user, l1, l2, level, topic, grammar, state):
    """Ask the selected LLM for a lesson and prime the session state.

    Returns a 6-tuple wired to (status_msg, out_story_text, out_raw_xml,
    session_state, quiz_tab visibility update, stats_tab visibility update).
    On any failure the incoming `state` is returned unchanged and both
    tabs stay hidden.
    """
    client = InferenceClient(LLM_MODELS[model_key], token=HF_TOKEN)
    sys_prompt = get_system_prompt(l2, level, topic, grammar)
    try:
        # FIX: We now explicitly set provider="hf-inference" to avoid 'ovhcloud' errors
        # If that fails, we fallback to 'auto'
        # NOTE(review): some huggingface_hub versions take `provider` on the
        # InferenceClient constructor, not on chat_completion — confirm against
        # the pinned hub version; the except below would mask that mismatch.
        try:
            response = client.chat_completion(
                messages=[{"role": "user", "content": sys_prompt}],
                max_tokens=3500,
                stream=False,
                provider="hf-inference" # FORCE internal HF servers
            ).choices[0].message.content
        except Exception:
            # Fallback if specific provider fails
            response = client.chat_completion(
                messages=[{"role": "user", "content": sys_prompt}],
                max_tokens=3500,
                stream=False
            ).choices[0].message.content
        sentences = parse_xml_to_sentences(response)
        if not sentences:
            return "β οΈ Generation failed. The model did not output valid XML. Try 'Llama 3.3'.", "", "", state, gr.update(visible=False), gr.update(visible=False)
        # Reconstruct Preview Text
        # Words get a leading space; punctuation attaches to the prior token.
        preview_text = ""
        for s in sentences:
            for item in s:
                if item['type'] == 'word': preview_text += " " + item['surface']
                else: preview_text += item['surface']
        # Fresh session: cursor at sentence 0, empty score/history.
        new_state = {
            "user": user, "l1": l1, "l2": l2, "level": level, "topic": topic, "grammar": grammar,
            "model": model_key, "sentences": sentences, "current_idx": 0, "score": 0, "history": []
        }
        return (
            f"β Lesson Generated: {len(sentences)} sentences found.",
            preview_text,
            response,
            new_state,
            gr.update(visible=True),
            gr.update(visible=False)
        )
    except Exception as e:
        return f"API Error: {str(e)}", "", "", state, gr.update(visible=False), gr.update(visible=False)
def load_next_question(state):
    """Serve the next quiz item, or the final report when the lesson ends.

    Sentences that yield no challenge are skipped. Returns a 6-tuple for
    (sentence display, prompt, progress label, quiz-tab update,
    stats-tab update, results dataframe).
    """
    total_q = len(state['sentences'])
    # Advance until a sentence produces a challenge (the original used
    # recursion here; this loop performs the identical state mutations).
    challenge = None
    while state['current_idx'] < total_q:
        sentence_items = state['sentences'][state['current_idx']]
        challenge = get_challenge(sentence_items, state.get('grammar', ''))
        if challenge is not None:
            break
        state['current_idx'] += 1

    if challenge is None:
        # Lesson exhausted — assemble the summary report.
        attempts = len(state['history'])
        right = state['score']
        pct = int((right / attempts) * 100) if attempts > 0 else 0
        df = pd.DataFrame(state['history'])
        if not df.empty:
            df = df[['target', 'input', 'correct', 'type', 'hw', 'pos', 'c5']]
            df.columns = ['Target', 'Your Input', 'Correct?', 'Question Type', 'Lemma', 'POS', 'C5 Tag']
        return (
            f"π Session Complete! Score: {right}/{attempts} ({pct}%)",
            "", "",
            gr.update(visible=False),
            gr.update(visible=True),
            df
        )

    state['active_challenge'] = challenge
    # Render the sentence with the target word replaced by a gap.
    pieces = []
    for i, item in enumerate(sentence_items):
        if i == challenge['index']:
            pieces.append(" [______] ")
        elif item['type'] == 'punct':
            pieces.append(item['surface'])
        else:
            pieces.append(" " + item['surface'])
    display_text = "".join(pieces)
    progress = f"Sentence {state['current_idx'] + 1} / {total_q}"
    return display_text.strip(), challenge['prompt'], progress, gr.update(visible=True), gr.update(visible=False), None
def process_answer(user_input, state):
    """Grade the learner's answer against the active challenge.

    Comparison is case-insensitive after stripping the input. The attempt
    is recorded in history, pushed to the dataset log, and the sentence
    cursor advanced. Returns (feedback markdown, updated state).
    """
    challenge = state['active_challenge']
    expected = challenge['target']
    is_correct = user_input.strip().lower() == expected.lower()
    if is_correct:
        state['score'] += 1
    attempt = {
        "type": challenge['type'],
        "target": expected,
        "input": user_input,
        "correct": is_correct,
        "hw": challenge['hw'],
        "pos": challenge['pos'],
        "c5": challenge['c5'],
    }
    state['history'].append(attempt)
    log_to_dataset(state, attempt)
    state['current_idx'] += 1
    if is_correct:
        feedback = "β Correct!"
    else:
        feedback = f"β Answer: {expected} (Base: {challenge['hw']}, Tag: {challenge['c5']})"
    return feedback, state
# --- UI LAYOUT ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal")) as demo:
    gr.HTML("<h1 style='text-align: center; color: #0d9488;'>ποΈ PANINI SLA: Morphology & Syntax Lab</h1>")
    # Per-browser-session quiz state (sentences, cursor, score, history).
    session_state = gr.State({})
    # Tab 1: teacher configures learner/language/grammar and generates a lesson.
    with gr.Tab("1. Teacher Console"):
        with gr.Row():
            in_model = gr.Dropdown(list(LLM_MODELS.keys()), label="AI Model", value=list(LLM_MODELS.keys())[0])
            in_user = gr.Textbox(label="Learner ID", value="student_01")
        with gr.Row():
            in_l1 = gr.Textbox(label="L1 (Native)", value="English")
            in_l2 = gr.Dropdown(["English", "French", "German", "Spanish", "Italian"], label="Target Language", value="English")
            in_level = gr.Radio(["A1", "A2", "B1", "B2", "C1"], label="CEFR Level", value="B1")
        with gr.Row():
            in_topic = gr.Textbox(label="Context", value="A Mystery Story")
            in_grammar = gr.Textbox(label="Grammar Focus", placeholder="e.g. Plural Nouns, Comparatives, Past Tense")
        btn_gen = gr.Button("Generate XML Lesson", variant="primary")
        status_msg = gr.Markdown()
        with gr.Accordion("π View Linguistic Data", open=False):
            out_story_text = gr.Textbox(label="Story Text", interactive=False, lines=3)
            out_raw_xml = gr.Code(label="BNC XML", language="html", interactive=False)
    # Tab 2: student gap-fill lab; hidden until a lesson is generated.
    with gr.Tab("2. Student Lab", visible=False) as quiz_tab:
        with gr.Row():
            progress_disp = gr.Label("Progress: 0/0")
        sentence_disp = gr.Markdown("### Sentence...")
        prompt_disp = gr.Markdown("Prompt")
        with gr.Row():
            ans_in = gr.Textbox(label="Your Answer")
            btn_sub = gr.Button("Submit", variant="primary")
        feedback_disp = gr.Markdown()
        btn_next = gr.Button("Next Question")
    # Tab 3: end-of-session report; revealed by load_next_question.
    with gr.Tab("3. Linguistic Report", visible=False) as stats_tab:
        final_msg_disp = gr.Markdown("### Summary")
        results_df = gr.Dataframe(label="Detailed Analysis (Lemma, POS, C5)")
        btn_restart = gr.Button("Start New Session")
    # --- EVENTS ---
    # Generate the lesson, then chain straight into the first question.
    btn_gen.click(
        generate_lesson_logic,
        [in_model, in_user, in_l1, in_l2, in_level, in_topic, in_grammar, session_state],
        [status_msg, out_story_text, out_raw_xml, session_state, quiz_tab, stats_tab]
    ).then(
        load_next_question,
        session_state,
        [sentence_disp, prompt_disp, progress_disp, quiz_tab, stats_tab, results_df]
    )
    btn_sub.click(process_answer, [ans_in, session_state], [feedback_disp, session_state])
    btn_next.click(load_next_question, session_state, [sentence_disp, prompt_disp, progress_disp, quiz_tab, stats_tab, results_df])
    # Restart only flips tab visibility; a fresh state is built on next generate.
    btn_restart.click(lambda: (gr.update(visible=True), gr.update(visible=False)), None, [quiz_tab, stats_tab])
demo.launch(ssr_mode=False)