# PANINI-SLA / app.py
# Uploaded by st192011 — commit 106cf35 (verified): "Update app.py"
import os
import re
import json
import random
import pandas as pd
import xml.etree.ElementTree as ET
import gradio as gr
from datetime import datetime
from huggingface_hub import InferenceClient, HfApi, hf_hub_download
# --- SETTINGS & AUTH ---
# The HF write token is stored in a Space secret named "DATASET";
# when it is absent, dataset logging is silently disabled (see log_to_dataset).
HF_TOKEN = os.getenv("DATASET")
DATASET_REPO = "st192011/PANINI-SLA"  # dataset repo that receives the learner corpus
CSV_FILENAME = "learner_corpus_v5_morphology.csv"  # single CSV appended to on every attempt
# --- EXPANDED MODEL LIST ---
# Display label (shown in the dropdown) -> HF model id passed to InferenceClient.
# Categorized for the user to understand trade-offs
LLM_MODELS = {
    # --- The Speedsters (Instant generation, slightly less strict) ---
    "Qwen 2.5 7B (Alibaba) - Recommended": "Qwen/Qwen2.5-7B-Instruct",
    "Llama 3.2 3B (Meta)": "meta-llama/Llama-3.2-3B-Instruct",
    "Zephyr 7B (HuggingFace H4)": "HuggingFaceH4/zephyr-7b-beta",
    # --- The Powerhouses (Smartest, Slower) ---
    "Llama 3.3 70B (Meta)": "meta-llama/Llama-3.3-70B-Instruct",
    "Qwen 2.5 72B (Alibaba)": "Qwen/Qwen2.5-72B-Instruct",
    "Mixtral 8x7B (Mistral AI)": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # --- The Efficient Teachers (Fast, Good Logic) ---
    "Mistral Nemo 12B (Mistral AI)": "mistralai/Mistral-Nemo-Instruct-2407",
    "Gemma 2 9B (Google)": "google/gemma-2-9b-it",
    "Phi-4 (Microsoft)": "microsoft/Phi-4",
}
# Hub client used for dataset uploads (token may be None -> uploads will fail
# and be caught/printed in log_to_dataset).
api = HfApi(token=HF_TOKEN)
# --- 1. PROMPT DESIGN (Teacher) ---
def get_system_prompt(language, level, topic, grammar):
    """Build the instruction prompt sent to the LLM.

    Args:
        language: Target language for the passage (e.g. "English").
        level: CEFR level label (e.g. "B1").
        topic: Free-text topic the passage should be about.
        grammar: Grammar feature the passage must exercise.

    Returns:
        A prompt string asking the model for a plain-text story followed by
        BNC-style XML with C5 morphology tags — the exact markup that
        parse_xml_to_sentences consumes (<root>, <s>, <w hw/pos/c5>).
    """
    # NOTE: everything from the opening f-triple-quote to the closing quotes
    # is the runtime prompt contract with the model; do not edit wording,
    # tag names, or the example casually.
    return f"""You are an expert Applied Linguist and Computational Linguist.
TASK: Create a cohesive reading passage (8-12 sentences) in {language} ({level}) about '{topic}'.
GRAMMAR FOCUS: {grammar} (The text must naturally require this grammar).
OUTPUT FORMAT:
1. First, provide the story in plain text.
2. Second, provide the story in BNC-style XML.
XML RULES:
- Use <root> and <s> sentences.
- Words: <w hw="LEMMA" pos="MAJOR_POS" c5="SPECIFIC_TAG">word</w>
- CRITICAL: Use correct C5 tags for inflection:
- Nouns: NN1 (Singular), NN2 (Plural)
- Adjectives: AJ0 (Base), AJC (Comparative), AJS (Superlative)
- Verbs: VVI (Inf), VVD (Past), VVG (Gerund), VVN (Past Part), VVZ (3rd Pers)
- Prepositions: PRP
Example:
<root>
<s>
<w hw="be" pos="VERB" c5="VBD">was</w>
<w hw="big" pos="ADJ" c5="AJC">bigger</w>
<w hw="dog" pos="NOUN" c5="NN2">dogs</w>
</s>
</root>
"""
# --- 2. PARSING (Linguist) ---
def parse_xml_to_sentences(xml_string):
    """Extract tokenized sentences from the model's BNC-style XML reply.

    Args:
        xml_string: Raw LLM output, possibly wrapped in markdown fences and
            surrounded by conversational chatter.

    Returns:
        A list of sentences; each sentence is a list of dicts, either
        {"type": "word", "surface", "hw", "pos", "c5"} for <w> elements or
        {"type": "punct", "surface"} for <c> elements. Returns [] when no
        parseable XML is found (the error is printed, never raised).
    """
    # Models frequently fence their XML in ``` blocks — strip those first.
    cleaned = re.sub(r'```xml|```', '', xml_string).strip()
    parsed = []
    try:
        # Locate the XML island inside any surrounding chatter; accept a
        # bare run of <s> elements when the model forgot the <root> wrapper.
        block = re.search(r'<root>.*</root>|<s>.*</s>', cleaned, re.DOTALL)
        if block is None:
            return parsed
        fragment = block.group()
        if not fragment.startswith('<root>'):
            fragment = f"<root>{fragment}</root>"
        for sentence_node in ET.fromstring(fragment).findall('s'):
            tokens = []
            for node in sentence_node:
                text = (node.text or "").strip()
                if node.tag == 'w':
                    tokens.append({
                        "type": "word",
                        "surface": text,
                        "hw": node.get("hw", ""),
                        "pos": node.get("pos", ""),
                        "c5": node.get("c5", ""),
                    })
                elif node.tag == 'c':
                    tokens.append({"type": "punct", "surface": text})
            if tokens:
                parsed.append(tokens)
    except Exception as err:
        print(f"XML Parse Error: {err}")
    return parsed
# --- 3. ADVANCED CHALLENGE LOGIC (The Proctor) ---
def get_challenge(sentence_items, grammar_focus):
    """Choose one word of a sentence to blank out as a quiz item.

    Args:
        sentence_items: One sentence as produced by parse_xml_to_sentences
            (list of word/punct token dicts).
        grammar_focus: Free-text grammar focus typed by the teacher; keyword
            matching biases the choice toward the requested phenomenon.

    Returns:
        A challenge dict (type, index, target, prompt, hw, pos, c5) or None
        when the sentence contains nothing quizzable.
    """
    pool = []

    def add(kind, idx, word, prompt, lemma, pos_tag, c5_tag):
        # Small helper so every candidate carries the same schema.
        pool.append({
            "type": kind, "index": idx, "target": word,
            "prompt": prompt, "hw": lemma, "pos": pos_tag, "c5": c5_tag,
        })

    for idx, token in enumerate(sentence_items):
        if token['type'] != 'word':
            continue
        word = token['surface']
        lemma = token['hw']
        pos_tag = token['pos']
        c5_tag = token['c5']
        inflected = word.lower() != lemma.lower()
        # --- LOGIC RULES (mutually exclusive, first match wins) ---
        if pos_tag == 'NOUN' and c5_tag == 'NN2':
            add("Noun Pluralization", idx, word, f"Plural form of '{lemma}'", lemma, pos_tag, c5_tag)
        elif pos_tag == 'ADJ' and c5_tag == 'AJC':
            add("Comparative Adjective", idx, word, f"Comparative form of '{lemma}'", lemma, pos_tag, c5_tag)
        elif pos_tag == 'ADJ' and c5_tag == 'AJS':
            add("Superlative Adjective", idx, word, f"Superlative form of '{lemma}'", lemma, pos_tag, c5_tag)
        elif pos_tag == 'VERB' and inflected:
            add("Verb Conjugation", idx, word, f"Correct form of verb '{lemma}'", lemma, pos_tag, c5_tag)
        elif pos_tag in ('PREP', 'PRP', 'ADP'):
            add("Preposition", idx, word, "Correct Preposition", lemma, pos_tag, c5_tag)

    # --- FILTERING BASED ON USER FOCUS ---
    # Each (keywords, marker) group is tried in order; an empty match falls
    # through to the next group rather than giving up.
    focus = grammar_focus.lower()
    focus_groups = (
        (("plural", "noun"), "Noun"),
        (("comparative", "superlative", "adjective"), "Adjective"),
        (("verb", "tense"), "Verb"),
        (("prep",), "Preposition"),
    )
    for keywords, marker in focus_groups:
        if any(k in focus for k in keywords):
            matching = [c for c in pool if marker in c['type']]
            if matching:
                return random.choice(matching)
    # Fallback: any candidate at all, or None for an unquizzable sentence.
    return random.choice(pool) if pool else None
# --- 4. PERSISTENCE ---
def log_to_dataset(state, last_attempt):
    """Append one graded quiz attempt to the HF dataset CSV (best-effort).

    Args:
        state: Session dict carrying model/user/l1/l2/level/topic/grammar.
        last_attempt: Dict for the attempt just graded (type, target, input,
            correct, hw, pos, c5).

    Returns immediately when no HF token is configured. Download and upload
    errors are printed, never raised, so grading is never blocked by logging.
    """
    if not HF_TOKEN:
        return
    new_entry = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model": state.get('model'),
        "user": state.get('user'),
        "l1": state.get('l1'),
        "l2": state.get('l2'),
        "level": state.get('level'),
        "topic": state.get('topic'),
        "grammar_focus": state.get('grammar'),
        "quiz_type": last_attempt.get('type'),
        "target_word": last_attempt.get('target'),
        "student_input": last_attempt.get('input'),
        "is_correct": last_attempt.get('correct'),
        "lemma": last_attempt.get('hw'),
        "pos": last_attempt.get('pos'),
        "c5_tag": last_attempt.get('c5')
    }
    try:
        try:
            # Pull the existing corpus so the new row can be appended.
            path = hf_hub_download(repo_id=DATASET_REPO, filename=CSV_FILENAME, repo_type="dataset", token=HF_TOKEN)
            df = pd.read_csv(path)
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. First run / missing file:
            # start a fresh frame with the same column order.
            df = pd.DataFrame(columns=list(new_entry.keys()))
        df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
        # NOTE(review): read-modify-write of a shared CSV is racy if two
        # learners submit at once; acceptable for a single-user Space.
        api.upload_file(
            path_or_fileobj=df.to_csv(index=False).encode("utf-8"),
            path_in_repo=CSV_FILENAME,
            repo_id=DATASET_REPO,
            repo_type="dataset"
        )
    except Exception as e:
        print(f"Logging Error: {e}")
# --- 5. GRADIO FLOW ---
def generate_lesson_logic(model_key, user, l1, l2, level, topic, grammar, state):
    """Call the chosen LLM, parse its XML, and initialise a fresh session.

    Returns a 6-tuple matching the Gradio outputs:
    (status message, preview text, raw XML, session state,
     quiz-tab visibility update, stats-tab visibility update).
    """
    client = InferenceClient(LLM_MODELS[model_key], token=HF_TOKEN)
    messages = [{"role": "user", "content": get_system_prompt(l2, level, topic, grammar)}]
    try:
        # Prefer HF's own inference servers; some routed providers
        # (e.g. ovhcloud) reject these models, so retry with auto-routing.
        try:
            completion = client.chat_completion(
                messages=messages,
                max_tokens=3500,
                stream=False,
                provider="hf-inference",  # FORCE internal HF servers
            )
        except Exception:
            completion = client.chat_completion(
                messages=messages,
                max_tokens=3500,
                stream=False,
            )
        response = completion.choices[0].message.content
        sentences = parse_xml_to_sentences(response)
        if not sentences:
            return "⚠️ Generation failed. The model did not output valid XML. Try 'Llama 3.3'.", "", "", state, gr.update(visible=False), gr.update(visible=False)
        # Rebuild a readable preview: words get a leading space,
        # punctuation attaches directly to the previous token.
        pieces = []
        for sentence in sentences:
            for item in sentence:
                if item['type'] == 'word':
                    pieces.append(" " + item['surface'])
                else:
                    pieces.append(item['surface'])
        preview_text = "".join(pieces)
        new_state = {
            "user": user, "l1": l1, "l2": l2, "level": level, "topic": topic, "grammar": grammar,
            "model": model_key, "sentences": sentences, "current_idx": 0, "score": 0, "history": []
        }
        return (
            f"βœ… Lesson Generated: {len(sentences)} sentences found.",
            preview_text,
            response,
            new_state,
            gr.update(visible=True),
            gr.update(visible=False)
        )
    except Exception as e:
        return f"API Error: {str(e)}", "", "", state, gr.update(visible=False), gr.update(visible=False)
def load_next_question(state):
    """Advance to the next quizzable sentence, or render the final report.

    Returns a 6-tuple matching the Gradio outputs:
    (sentence/summary markdown, prompt text, progress label,
     quiz-tab update, stats-tab update, results dataframe or None).
    """
    total_q = len(state['sentences'])
    challenge = None
    # Skip sentences with no quizzable gap — iterative form of the original
    # tail recursion, so an all-unquizzable lesson cannot blow the stack.
    while state['current_idx'] < total_q:
        sentence_items = state['sentences'][state['current_idx']]
        challenge = get_challenge(sentence_items, state.get('grammar', ''))
        if challenge:
            break
        state['current_idx'] += 1
    if challenge is None:
        # Lesson exhausted: compute the score summary and detail table.
        answered = len(state['history'])
        correct = state['score']
        pct = int((correct / answered) * 100) if answered > 0 else 0
        report = pd.DataFrame(state['history'])
        if not report.empty:
            report = report[['target', 'input', 'correct', 'type', 'hw', 'pos', 'c5']]
            report.columns = ['Target', 'Your Input', 'Correct?', 'Question Type', 'Lemma', 'POS', 'C5 Tag']
        return (
            f"πŸŽ‰ Session Complete! Score: {correct}/{answered} ({pct}%)",
            "", "",
            gr.update(visible=False),
            gr.update(visible=True),
            report
        )
    state['active_challenge'] = challenge
    # Render the sentence with the target word replaced by a blank.
    fragments = []
    for position, item in enumerate(sentence_items):
        if position == challenge['index']:
            fragments.append(" [______] ")
        elif item['type'] == 'punct':
            fragments.append(item['surface'])
        else:
            fragments.append(" " + item['surface'])
    progress = f"Sentence {state['current_idx'] + 1} / {total_q}"
    return "".join(fragments).strip(), challenge['prompt'], progress, gr.update(visible=True), gr.update(visible=False), None
def process_answer(user_input, state):
    """Grade the learner's answer, persist the attempt, advance the cursor.

    Returns:
        (feedback markdown, mutated session state).
    """
    challenge = state['active_challenge']
    # Case-insensitive exact match against the blanked-out surface form.
    correct = user_input.strip().lower() == challenge['target'].lower()
    if correct:
        state['score'] += 1
    attempt = {
        "type": challenge['type'], "target": challenge['target'],
        "input": user_input, "correct": correct,
        "hw": challenge['hw'], "pos": challenge['pos'], "c5": challenge['c5']
    }
    state['history'].append(attempt)
    log_to_dataset(state, attempt)  # best-effort; never raises
    state['current_idx'] += 1
    if correct:
        feedback = "βœ… Correct!"
    else:
        feedback = f"❌ Answer: {challenge['target']} (Base: {challenge['hw']}, Tag: {challenge['c5']})"
    return feedback, state
# --- UI LAYOUT ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal")) as demo:
    gr.HTML("<h1 style='text-align: center; color: #0d9488;'>πŸŽ™οΈ PANINI SLA: Morphology & Syntax Lab</h1>")
    # Per-browser session dict: settings, parsed sentences, quiz cursor,
    # score, and attempt history (filled by generate_lesson_logic).
    session_state = gr.State({})
    # Tab 1: teacher picks model/language/level and generates the lesson.
    with gr.Tab("1. Teacher Console"):
        with gr.Row():
            in_model = gr.Dropdown(list(LLM_MODELS.keys()), label="AI Model", value=list(LLM_MODELS.keys())[0])
            in_user = gr.Textbox(label="Learner ID", value="student_01")
        with gr.Row():
            in_l1 = gr.Textbox(label="L1 (Native)", value="English")
            in_l2 = gr.Dropdown(["English", "French", "German", "Spanish", "Italian"], label="Target Language", value="English")
            in_level = gr.Radio(["A1", "A2", "B1", "B2", "C1"], label="CEFR Level", value="B1")
        with gr.Row():
            in_topic = gr.Textbox(label="Context", value="A Mystery Story")
            in_grammar = gr.Textbox(label="Grammar Focus", placeholder="e.g. Plural Nouns, Comparatives, Past Tense")
        btn_gen = gr.Button("Generate XML Lesson", variant="primary")
        status_msg = gr.Markdown()
        with gr.Accordion("πŸ” View Linguistic Data", open=False):
            out_story_text = gr.Textbox(label="Story Text", interactive=False, lines=3)
            out_raw_xml = gr.Code(label="BNC XML", language="html", interactive=False)
    # Tab 2: hidden until a lesson exists; shows the gap-fill quiz.
    with gr.Tab("2. Student Lab", visible=False) as quiz_tab:
        with gr.Row():
            progress_disp = gr.Label("Progress: 0/0")
        sentence_disp = gr.Markdown("### Sentence...")
        prompt_disp = gr.Markdown("Prompt")
        with gr.Row():
            ans_in = gr.Textbox(label="Your Answer")
            btn_sub = gr.Button("Submit", variant="primary")
        feedback_disp = gr.Markdown()
        btn_next = gr.Button("Next Question")
    # Tab 3: hidden until the quiz finishes; score summary + detail table.
    with gr.Tab("3. Linguistic Report", visible=False) as stats_tab:
        final_msg_disp = gr.Markdown("### Summary")
        results_df = gr.Dataframe(label="Detailed Analysis (Lemma, POS, C5)")
        btn_restart = gr.Button("Start New Session")
    # --- EVENTS ---
    # Generate the lesson, then chain straight into the first question.
    btn_gen.click(
        generate_lesson_logic,
        [in_model, in_user, in_l1, in_l2, in_level, in_topic, in_grammar, session_state],
        [status_msg, out_story_text, out_raw_xml, session_state, quiz_tab, stats_tab]
    ).then(
        load_next_question,
        session_state,
        [sentence_disp, prompt_disp, progress_disp, quiz_tab, stats_tab, results_df]
    )
    btn_sub.click(process_answer, [ans_in, session_state], [feedback_disp, session_state])
    btn_next.click(load_next_question, session_state, [sentence_disp, prompt_disp, progress_disp, quiz_tab, stats_tab, results_df])
    # Restart only flips tab visibility; the teacher must generate a new
    # lesson to reset session_state.
    btn_restart.click(lambda: (gr.update(visible=True), gr.update(visible=False)), None, [quiz_tab, stats_tab])
# ssr_mode=False: skip server-side rendering (avoids SSR issues on Spaces).
demo.launch(ssr_mode=False)