Spaces:
Build error
Build error
| import json | |
| import re | |
| import csv | |
| import io | |
| from collections import Counter | |
| from typing import Any | |
# Fixed list of PAJAIS theme labels; _map_pajais reads it as the reference
# taxonomy. The order of entries is significant to callers that index into it.
PAJAIS_THEMES = [
    "Machine Learning",
    "Natural Language Processing",
    "Computer Vision",
    "Deep Learning",
    "Reinforcement Learning",
    "Ethical AI and Fairness",
    "Explainability and Interpretability",
    "Human-Computer Interaction",
    "AI in Healthcare",
    "AI in Education",
    "AI in Finance",
    "Autonomous Systems",
]
# Function-calling schemas advertised to the model. Each entry describes one
# tool by name, a description, and a JSON-Schema object for its arguments.
TOOL_DEFINITIONS = [
    {
        "type": "function",
        "function": {
            "name": "extract_topics_from_text",
            "description": "Extract meaningful research themes from a list of titles or abstracts.",
            "parameters": {
                "type": "object",
                "properties": {
                    # The documents to mine for topics.
                    "texts": {"type": "array", "items": {"type": "string"}},
                    # Whether the documents are titles or abstracts.
                    "text_type": {"type": "string", "enum": ["title", "abstract"]},
                    # Optional cap on how many topics to return.
                    "num_topics": {"type": "integer"},
                },
                "required": ["texts", "text_type"],
            },
        },
    },
    # NOTE: only the first schema is spelled out here; the schemas for the
    # other tools follow this same structure (Mistral format).
]
def execute_tool(tool_name: str, tool_input: dict, rows: list[dict]) -> Any:
    """Run the tool called ``tool_name`` and return its result.

    Args:
        tool_name: Name of the tool to run.
        tool_input: Arguments forwarded to the tool implementation.
        rows: Paper records; only the summary-table tool receives them.

    Returns:
        Whatever the selected tool returns, or ``{"error": "Tool not found"}``
        when ``tool_name`` is not one of the known tool names.
    """
    # Each handler is wrapped in a lambda so that nothing is looked up or
    # executed until the matching entry has actually been selected.
    handlers = {
        "extract_topics_from_text": lambda: _extract_topics(tool_input),
        "cluster_papers_by_topic": lambda: _cluster_papers(tool_input),
        "compare_title_vs_abstract_themes": lambda: _compare_themes(tool_input),
        "map_to_pajais_taxonomy": lambda: _map_pajais(tool_input),
        "generate_topic_summary_table": lambda: _summary_table(tool_input, rows),
    }
    handler = handlers.get(tool_name)
    if handler is None:
        return {"error": "Tool not found"}
    return handler()
| def _extract_topics(inp: dict) -> dict: | |
| texts = inp.get("texts", []) | |
| num = inp.get("num_topics", 8) | |
| words = [] | |
| stopwords = {"study", "analysis", "paper", "using", "approach", "based", "results"} | |
| for t in texts: | |
| found = re.findall(r"\b[a-zA-Z]{5,}\b", t.lower()) | |
| words.extend([w for w in found if w not in stopwords]) | |
| top = [w.title() for w, _ in Counter(words).most_common(num)] | |
| return {"topics": top} | |
| def _cluster_papers(inp: dict) -> dict: | |
| papers = inp.get("papers", []) | |
| topics = inp.get("topics", []) | |
| clusters = {t: [] for t in topics} | |
| for p in papers: | |
| txt = p.get("text", "").lower() | |
| for t in topics: | |
| if t.lower() in txt: | |
| clusters[t].append(p.get("sr_no")) | |
| break | |
| return {"clusters": clusters} | |
| def _compare_themes(inp: dict) -> dict: | |
| t_set = set(inp.get("title_topics", [])) | |
| a_set = set(inp.get("abstract_topics", [])) | |
| matched = list(t_set & a_set) | |
| return { | |
| "matched_themes": matched, | |
| "title_only_themes": list(t_set - a_set), | |
| "abstract_only_themes": list(a_set - t_set), | |
| "overlap_percentage": round(len(matched)/max(len(t_set|a_set),1)*100, 1) | |
| } | |
def _map_pajais(inp: dict) -> dict:
    """Map discovered topics onto ``PAJAIS_THEMES`` by shared words.

    Every discovered topic is compared with every PAJAIS theme on their
    lower-cased alphabetic words, ignoring the connective words "ai", "and"
    and "in". A topic maps to the theme it shares the most words with; a topic
    sharing no word with any theme is reported as novel.

    Args:
        inp: Tool input; ``discovered_topics`` is the list of topic labels to
            map (a missing or null value is treated as empty).

    Returns:
        A dict with:
        ``MAPPED``: one ``{"discovered", "pajais_match", "score"}`` entry per
        mapped topic, where ``score`` is the number of shared words;
        ``NOVEL``: topics that matched no theme;
        ``pajais_gaps``: themes that no topic mapped to;
        ``coverage_pct``: percentage of themes with at least one mapped topic,
        rounded to one decimal place.
    """
    discovered = inp.get("discovered_topics") or []
    # Connectives that appear in several theme names and carry no meaning on
    # their own; matching on them would create spurious mappings.
    generic_words = {"ai", "and", "in"}

    def keywords(label) -> set:
        # Hyphens and other punctuation act as word separators.
        return set(re.findall(r"[a-z]+", str(label).lower())) - generic_words

    theme_keywords = [(theme, keywords(theme)) for theme in PAJAIS_THEMES]

    mapped = []
    novel = []
    covered = set()
    for topic in discovered:
        topic_words = keywords(topic)
        best_theme, best_score = None, 0
        for theme, words in theme_keywords:
            score = len(topic_words & words)
            # Strict comparison: on a tie the earlier theme is kept.
            if score > best_score:
                best_theme, best_score = theme, score
        if best_theme is None:
            novel.append(topic)
            continue
        mapped.append({"discovered": topic, "pajais_match": best_theme, "score": best_score})
        covered.add(best_theme)

    gaps = [theme for theme in PAJAIS_THEMES if theme not in covered]
    # max(..., 1) guards the division should the theme list ever be empty.
    coverage = round(len(covered) / max(len(PAJAIS_THEMES), 1) * 100, 1)
    return {"MAPPED": mapped, "NOVEL": novel, "pajais_gaps": gaps, "coverage_pct": coverage}
| def _summary_table(inp: dict, rows: list[dict]) -> dict: | |
| clusters = inp.get("clusters", {}) | |
| meta = {str(r.get("Sr No", "")): r for r in rows} | |
| table = [] | |
| for topic, sns in clusters.items(): | |
| titles = [meta[str(sn)].get("Title", "")[:80] for sn in sns[:3] if str(sn) in meta] | |
| table.append({"topic_label": topic, "paper_count": len(sns), "representative_titles": titles}) | |
| return {"summary_table": table} | |
def build_comparison_csv(res: dict) -> str:
    """Render a title-vs-abstract theme comparison as CSV text.

    Args:
        res: A comparison result with the optional keys ``matched_themes``,
            ``title_only_themes`` and ``abstract_only_themes`` (the shape
            returned by ``_compare_themes``). Missing or null keys contribute
            no rows.

    Returns:
        CSV text with a ``Theme,Source`` header followed by one row per
        theme. ``Source`` is "Both", "Title only" or "Abstract only"; matched
        themes are listed first.
    """
    out = io.StringIO()
    writer = csv.writer(out)
    writer.writerow(["Theme", "Source"])
    # Emit every group so the Source column shows where each theme came from,
    # rather than listing only the themes common to both sides.
    groups = (
        ("matched_themes", "Both"),
        ("title_only_themes", "Title only"),
        ("abstract_only_themes", "Abstract only"),
    )
    for key, source in groups:
        writer.writerows([theme, source] for theme in res.get(key) or [])
    return out.getvalue()
def build_taxonomy_json(res: dict) -> str:
    """Serialize a result dict as JSON text indented by two spaces.

    Args:
        res: The JSON-serializable result to export.

    Returns:
        The JSON document as a string.
    """
    pretty = json.dumps(res, indent=2)
    return pretty