Spaces:

kachaf
/

sarf

Sleeping

App Files Files Community

sarf / app.py

Boulbaba

Upload app.py

b193db3 verified 8 days ago

raw

history blame contribute delete

4.71 kB

	from flask import Flask, request, Response
	from pyaramorph import Analyzer
	import json
	import os

	# Flask application object; the @app.route decorators in this file register handlers on it.
	app = Flask(__name__)
	# pyaramorph analyzer created once at import time and reused by every /analyze request.
	analyzer = Analyzer()

	# Character classes consulted by build_arudi_key when it normalizes a word.
	SHORT_VOWELS = "\u064e\u064f\u0650"  # fatha, damma, kasra
	TANWEEN = "\u064b\u064c\u064d"  # fathatan, dammatan, kasratan
	SUKUN = "\u0652"  # sukun
	SHADDA = "\u0651"  # shadda

	# Every letter listed here collapses to the same "M" marker in the arudi key.
	LONG_VOWELS = dict.fromkeys(("ا", "ى", "آ", "و", "ي"), "M")

	# Values accepted for the "need" field of an /analyze request.
	VALID_NEEDS = {"aroudh", "sarf"}


	def json_response(payload, status=200):
	    """Wrap *payload* in a UTF-8 JSON HTTP response.

	    Args:
	        payload: JSON-serializable object to send as the response body.
	        status: HTTP status code of the response (200 by default).

	    Returns:
	        A Flask ``Response`` whose body is the serialized payload.
	    """
	    # ensure_ascii=False keeps non-ASCII text (e.g. Arabic) as-is instead of \uXXXX escapes.
	    body = json.dumps(payload, ensure_ascii=False)
	    return Response(body, status=status, content_type="application/json; charset=utf-8")


	def extract_solution_only(solution_text: str) -> str:
	    """Return the first line of a solution entry without its ``solution:`` label.

	    Args:
	        solution_text: One raw solution entry; it may span several lines.

	    Returns:
	        The stripped first line of the stripped text, with a leading
	        ``solution:`` prefix removed when present. An empty string is
	        returned for empty, ``None`` or whitespace-only input.
	    """
	    prefix = "solution:"

	    # Whitespace-only text strips down to "", whose splitlines() is an empty
	    # list; guard here instead of indexing [0] and raising IndexError.
	    lines = (solution_text or "").strip().splitlines()
	    if not lines:
	        return ""

	    first_line = lines[0].strip()
	    if first_line.startswith(prefix):
	        # Drop only the leading label; later occurrences are left untouched.
	        return first_line[len(prefix):].strip()

	    return first_line



	def build_arudi_key(word: str) -> str:
	    """Map *word* to a normalized key used to detect prosodic duplicates.

	    Each character is translated independently:
	    - any character in SHORT_VOWELS -> "V"
	    - any character in TANWEEN -> "VN"
	    - SUKUN -> "S"
	    - SHADDA -> "D"
	    - any key of LONG_VOWELS -> its mapped marker
	    - every other character is copied unchanged

	    NOTE(review): the LONG_VOWELS lookup is unconditional, so a letter listed
	    there is replaced even when it is followed by a diacritic - confirm this
	    is the intended treatment.
	    """

	    def symbol_for(char):
	        # The order of these checks mirrors the priority of the rules above.
	        if char in SHORT_VOWELS:
	            return "V"
	        if char in TANWEEN:
	            return "VN"
	        if char == SUKUN:
	            return "S"
	        if char == SHADDA:
	            return "D"
	        return LONG_VOWELS.get(char, char)

	    return "".join(symbol_for(char) for char in word)



	def format_sarf_results(results):
	    """Convert analyzer word blocks into ``{"word", "solutions"}`` records.

	    Each non-empty block contributes one record: its first item, with the
	    "analysis for: " label removed and stripped, becomes the word; the
	    remaining items are stripped and kept as solutions unless they are blank.
	    Empty blocks are skipped.
	    """
	    return [
	        {
	            "word": block[0].replace("analysis for: ", "").strip(),
	            "solutions": [entry.strip() for entry in block[1:] if entry.strip()],
	        }
	        for block in results
	        if block
	    ]



	def format_aroudh_results(results):
	    """Convert analyzer word blocks into records of prosodically unique solutions.

	    For every non-empty block, the first item (minus the "analysis for: "
	    label) is the word. Each remaining item is reduced with
	    extract_solution_only; empty results are dropped, and only the first
	    solution seen for each build_arudi_key value is kept, in input order.
	    """
	    records = []

	    for block in results:
	        if not block:
	            continue

	        # Dicts keep insertion order and setdefault never overwrites, so this
	        # retains the first solution encountered for every distinct key.
	        first_by_key = {}
	        for raw_entry in block[1:]:
	            cleaned = extract_solution_only(raw_entry)
	            if cleaned:
	                first_by_key.setdefault(build_arudi_key(cleaned), cleaned)

	        records.append({
	            "word": block[0].replace("analysis for: ", "").strip(),
	            "solutions": [{"solution": kept} for kept in first_by_key.values()],
	        })

	    return records


	@app.route("/", methods=["GET"])
	def home():
	    """Return a short JSON description of the service and its accepted needs."""
	    service_info = {
	        "success": True,
	        "message": "PyAraMorph API is running",
	        "supported_needs": ["sarf", "aroudh"],
	        "default_need": "sarf",
	    }
	    return json_response(service_info)


	@app.route("/health", methods=["GET"])
	def health():
	    """Answer health probes with a fixed JSON "ok" payload."""
	    status_payload = {"success": True, "status": "ok"}
	    return json_response(status_payload)


	@app.route("/analyze", methods=["POST"])
	def analyze():
	    """Analyze the posted text and return the results formatted for the requested need.

	    Expects a JSON object with:
	        text: non-empty string to analyze (required).
	        need: "sarf" or "aroudh"; defaults to "sarf" when missing or falsy.

	    Returns:
	        200 with ``success``, ``need``, ``text`` and ``results`` on success;
	        400 with ``success: False`` and a message when the body is not a JSON
	        object, or when ``text`` / ``need`` is missing, mistyped or invalid;
	        500 with the exception message when analysis or formatting fails.
	    """
	    # silent=True yields None instead of raising on a missing or malformed body.
	    data = request.get_json(silent=True) or {}

	    # Valid JSON is not necessarily an object (e.g. a list or a number);
	    # reject those here instead of failing on .get() with an unhandled 500.
	    if not isinstance(data, dict):
	        return json_response({
	            "success": False,
	            "message": "request body must be a JSON object",
	        }, 400)

	    text = data.get("text") or ""
	    if not isinstance(text, str):
	        return json_response({
	            "success": False,
	            "message": "text must be a string",
	        }, 400)

	    text = text.strip()
	    if not text:
	        return json_response({
	            "success": False,
	            "message": "text is required",
	        }, 400)

	    need = data.get("need") or "sarf"
	    # A non-string need can never match VALID_NEEDS; report it as invalid
	    # rather than crashing on .strip().
	    if isinstance(need, str):
	        need = need.strip().lower()
	    if not isinstance(need, str) or need not in VALID_NEEDS:
	        return json_response({
	            "success": False,
	            "message": 'need must be either "sarf" or "aroudh"',
	        }, 400)

	    try:
	        results = analyzer.analyze_text(text)

	        if need == "sarf":
	            formatted_results = format_sarf_results(results)
	        else:
	            formatted_results = format_aroudh_results(results)

	        return json_response({
	            "success": True,
	            "need": need,
	            "text": text,
	            "results": formatted_results,
	        })

	    except Exception as e:
	        # Top-level boundary: record the traceback so failures are diagnosable.
	        app.logger.exception("analysis failed")
	        # NOTE(review): str(e) exposes internal error text to the client;
	        # kept for backward compatibility - consider a generic message.
	        return json_response({
	            "success": False,
	            "message": str(e),
	        }, 500)


	if __name__ == "__main__":
	    # Listen on all interfaces; the port comes from $PORT and falls back to 7860.
	    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")), debug=False)