Spaces:

psugam
/

sanskrit-parser-api

Sleeping

App Files Community

psugam committed on Jan 5

Commit

6a1a740

verified ·

1 Parent(s): 61a4cfd

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -73

app.py CHANGED Viewed

@@ -1,74 +1,90 @@
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-import process_sanskrit as ps
-import re
-app = Flask(__name__)
-CORS(app)
-def clean_dictionary_html(content):
-    if isinstance(content, str):
-        clean = re.sub(r'<[^>]*>', '', content)
-        return re.sub(r'\s+', ' ', clean).strip()
-    if isinstance(content, list):
-        return [clean_dictionary_html(item) for item in content]
-    if isinstance(content, dict):
-        return {k: clean_dictionary_html(v) for k, v in content.items()}
-    return str(content)
-@app.route('/split')
-def split_word():
-    word = request.args.get('word')
-    if not word: return jsonify({"error": "No word"}), 400
-    try:
-        # ps.split can return a list of lists: [['p1', 'p2', 'p3']]
-        split_result = ps.split(word)
-        # We want the most complete split (usually the first item in the list)
-        if split_result and isinstance(split_result[0], list):
-            components = split_result[0]
-        else:
-            components = split_result if split_result else [word]
-        # Filter out empty strings and check if it actually split
-        components = [c for c in components if c]
-        is_compound = len(components) > 1
-        return jsonify({"is_compound": is_compound, "components": components})
-    except Exception as e:
-        print(f"Split error: {e}")
-        return jsonify({"is_compound": False, "components": [word]})
-@app.route('/meaning')
-def get_meaning():
-    word = request.args.get('word')
-    try:
-        # If the word is an infinitive (like 'gantum'), ps.process
-        # should find the root 'gam'.
-        raw_results = ps.process(word, 'mw', 'ap90', 'cae', 'bhs')
-        if not raw_results:
-            return jsonify([])
-        final_output = []
-        for entry in raw_results:
-            # Check if entry has enough data
-            stem = entry[0]
-            word_type = entry[1]
-            grammar = entry[2]
-            # If the library returns 'indeclinable' for an infinitive,
-            # we make sure to pass that through.
-            final_output.append({
-                "stem": stem,
-                "type": word_type if word_type else "morphology",
-                "detected_tags": grammar if grammar else [["form recognized"]],
-                "definitions": clean_dictionary_html(entry[6])
-            })
-        return jsonify(final_output)
-    except Exception as e:
-        print(f"Meaning error: {e}")
-        return jsonify({"error": str(e)}), 500
-if __name__ == '__main__':
     app.run(debug=True)

+from flask import Flask, request, jsonify
+from flask_cors import CORS
+import process_sanskrit as ps
+import re
+import json
+app = Flask(__name__)
+CORS(app)
+def clean_definitions(content):
+    """
+    Recursively flattens the dictionary data.
+    Normalizes whitespace but PRESERVES HTML tags.
+    """
+    results = []
+    if isinstance(content, str):
+        # Normalize whitespace but do NOT strip <tags>
+        clean = re.sub(r'\s+', ' ', content).strip()
+        if clean:
+            results.append(clean)
+    elif isinstance(content, list):
+        for item in content:
+            results.extend(clean_definitions(item))
+    elif isinstance(content, dict):
+        for value in content.values():
+            results.extend(clean_definitions(value))
+    return results
+@app.route('/split')
+def split_word():
+    word = request.args.get('word')
+    if not word: return jsonify({"error": "No word"}), 400
+    try:
+        split_result = ps.split(word)
+        if split_result and isinstance(split_result[0], list):
+            components = split_result[0]
+        else:
+            components = split_result if split_result else [word]
+        components = [c for c in components if c]
+        is_compound = len(components) > 1
+        return jsonify({"is_compound": is_compound, "components": components})
+    except Exception as e:
+        return jsonify({"is_compound": False, "components": [word]})
+@app.route('/meaning')
+def get_meaning():
+    word = request.args.get('word')
+    try:
+        raw_results = ps.process(word, 'mw', 'ap90', 'cae', 'bhs')
+        if not raw_results:
+            return jsonify([])
+        grouped_results = {}
+        for entry in raw_results:
+            stem = entry[0]
+            word_type = entry[1] if entry[1] else "morphology"
+            grammar = entry[2] if entry[2] else [["form recognized"]]
+            dict_data = entry[6]
+            formatted_defs = {}
+            for source, content in dict_data.items():
+                cleaned = clean_definitions(content)
+                if cleaned:
+                    formatted_defs[source] = cleaned
+            def_key = json.dumps(formatted_defs, sort_keys=True)
+            if def_key in grouped_results:
+                existing = grouped_results[def_key]
+                for tag_set in grammar:
+                    if tag_set not in existing["detected_tags"]:
+                        existing["detected_tags"].append(tag_set)
+                if word_type not in existing["type"]:
+                    existing["type"] += f" / {word_type}"
+            else:
+                grouped_results[def_key] = {
+                    "stem": stem,
+                    "type": word_type,
+                    "detected_tags": grammar,
+                    "definitions": formatted_defs
+                }
+        return jsonify(list(grouped_results.values()))
+    except Exception as e:
+        print(f"Meaning error: {e}")
+        return jsonify({"error": str(e)}), 500
+if __name__ == '__main__':
     app.run(debug=True)