Spaces:

PooryaPiroozfar
/

Persian_Semantic_Information_Extraction

Runtime error

App Files Files Community

PooryaPiroozfar committed on Feb 20

Commit

a1312ce

verified ·

1 Parent(s): a6a5bfb

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +22 -0
app.py +567 -0
final_frames.xlsx +0 -0
frame_triples2.xlsx +0 -0
requirements.txt +11 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,22 @@

+FROM python:3.10-slim
+# Do not write .pyc files and do not buffer stdout/stderr.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+WORKDIR /app
+# System packages used when a dependency has to be built from source.
+RUN apt-get update && apt-get install -y \
+    git \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies before copying the application code.
+COPY requirements.txt .
+RUN pip install --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+# nltk and stanza are only importable after the pip install above, so their
+# data is fetched here. Recent NLTK releases load "punkt_tab" for
+# sent_tokenize, older ones load "punkt"; both are downloaded.
+RUN python -m nltk.downloader punkt punkt_tab
+RUN python -c "import stanza; stanza.download('fa')"
+COPY . .
+EXPOSE 7860
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,567 @@

+# -*- coding: utf-8 -*-
+"""Pipeline_LLM&Models.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1JImuJ0lMNHJ2zkt1iSWnhjn204ZgpPMM
+# All
+## import
+"""
+import nltk
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import os
+import torch
+import numpy as np
+from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
+import json
+from openai import OpenAI
+import pandas as pd
+from huggingface_hub import snapshot_download
+import stanza
+import re
+import json
+import gradio as gr
+# DeepSeek is reached through the OpenAI client by overriding the base URL.
+# os.getenv returns None when DEEPSEEK_API_KEY is not set.
+# NOTE(review): the client is built at import time; confirm how it behaves with a None key.
+API_KEY = os.getenv("DEEPSEEK_API_KEY")
+BASE_URL = "https://api.deepseek.com"
+client = OpenAI(
+    api_key=API_KEY,
+    base_url=BASE_URL
+)
+# Spreadsheets shipped next to app.py: frame definitions and frame-to-triple mappings.
+FINAL_FRAMES_PATH = "final_frames.xlsx"
+TRIPLES_PATH = "frame_triples2.xlsx"
+# Hugging Face Hub repositories and the local directories they are downloaded into.
+FRAME_DET_REPO = "PooryaPiroozfar/frame-detection-parsbert"
+FE_REPO = "PooryaPiroozfar/srl-frame-elements-parsbert"
+FRAME_DET_DIR = "models/frame_detection"
+FE_BASE_DIR = "models/frame_elements"
+# -------------------------
+# Download the models (once)
+# -------------------------
+# The download is skipped whenever the target directory already exists.
+if not os.path.exists(FRAME_DET_DIR):
+    snapshot_download(repo_id=FRAME_DET_REPO, local_dir=FRAME_DET_DIR)
+if not os.path.exists(FE_BASE_DIR):
+    snapshot_download(repo_id=FE_REPO, local_dir=FE_BASE_DIR)
+# Both tables are loaded once at import time and read by the functions below.
+frames_df = pd.read_excel(FINAL_FRAMES_PATH)
+triples_df = pd.read_excel(TRIPLES_PATH)
+from nltk.tokenize import sent_tokenize
+def split_sentences(text):
+    return sent_tokenize(text)
+"""## Models"""
+# NOTE(review): `device` is assigned twice; the string assigned further down is the
+# value that remains in effect for the rest of the module.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+embedding_dim = 768
+# save_dir = '/content/drive/MyDrive/SRLFrameDetection'
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# List of frames. predict_frame indexes this list with the position of the
+# highest-scoring frame embedding, so the order here is significant.
+frame_names = [
+    "Activity_finish","Activity_start","Aging","Attaching","Attempt",
+    "Becoming","Being_born","Borrowing","Causation","Chatting",
+    "Choosing","Closure","Clothing","Cutting","Damaging","Desiring","Discussion",
+    "Emphasizing","Food","Installing","Locating","Memory","Morality_evaluation",
+    "Motion","Offering","Practice","Project","Publishing","Religious_belief",
+    "Removing","Request","Residence","Sharing","Taking","Telling","Travel",
+    "Using","Visiting","Waiting","Work"
+]
+# -------------------------
+# Encoder (ParsBERT)
+# -------------------------
+# Tokenizer and encoder used by get_embedding; the encoder is put in eval mode
+# because it is only used for inference here.
+encoder_name = "HooshvareLab/bert-base-parsbert-uncased"
+sent_tokenizer = AutoTokenizer.from_pretrained(encoder_name)
+sent_encoder = AutoModel.from_pretrained(encoder_name).to(device)
+sent_encoder.eval()
+def get_embedding(text):
+    inputs = sent_tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=128
+    ).to(device)
+    with torch.no_grad():
+        outputs = sent_encoder(**inputs)
+    token_embeddings = outputs.last_hidden_state
+    mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
+    summed = torch.sum(token_embeddings * mask, dim=1)
+    lengths = torch.clamp(mask.sum(dim=1), min=1e-9)
+    return (summed / lengths).squeeze(0)
+# -------------------------
+# مدل تشخیص فریم
+# -------------------------
+class FrameSimilarityModel(nn.Module):
+    def __init__(self, emb_dim, num_frames, frame_emb_init):
+        super().__init__()
+        self.proj = nn.Linear(emb_dim, emb_dim)
+        self.frame_embeddings = nn.Parameter(
+            torch.tensor(frame_emb_init, dtype=torch.float32)
+        )
+    def forward(self, sent_emb):
+        sent_proj = F.normalize(self.proj(sent_emb), dim=-1)
+        frames = F.normalize(self.frame_embeddings, dim=-1)
+        return torch.matmul(sent_proj, frames.T)
+# Frame embeddings saved with the downloaded model; the first dimension is
+# passed on as the number of frames.
+frame_embs = np.load(os.path.join(FRAME_DET_DIR, "trained_frame_embeddings.npy"))
+frame_model = FrameSimilarityModel(
+    emb_dim=768,
+    num_frames=frame_embs.shape[0],
+    frame_emb_init=frame_embs
+).to(device)
+# The checkpoint is loaded onto the CPU first; load_state_dict copies the
+# values into the model that was already moved to `device`.
+# NOTE(review): torch.load unpickles the file; confirm the checkpoint source is trusted.
+state_dict = torch.load(
+    os.path.join(FRAME_DET_DIR, "best_frame_margin_model.pt"),
+    map_location="cpu"
+)
+frame_model.load_state_dict(state_dict)
+frame_model.eval()
+# Minimum similarity for predict_frame to report a frame at all.
+THRESHOLD = 0.1   # can be tuned
+def predict_frame(sentence):
+    emb = get_embedding(sentence).unsqueeze(0)
+    with torch.no_grad():
+        sims = frame_model(emb)
+        max_sim, idx = torch.max(sims, dim=1)
+    if max_sim.item() < THRESHOLD:
+        return None, max_sim.item()
+    return frame_names[idx.item()], max_sim.item()
+# -------------------------
+# Frame Elements
+# -------------------------
+def predict_frame_elements(sentence, frame_name):
+    frame_dir = os.path.join(FE_BASE_DIR, frame_name)
+    if not os.path.exists(frame_dir):
+        return []
+    with open(os.path.join(frame_dir, "label2id.json"), encoding="utf-8") as f:
+        label2id = json.load(f)
+    id2label = {int(v): k for k, v in label2id.items()}
+    tokenizer = AutoTokenizer.from_pretrained(frame_dir)
+    model = AutoModelForTokenClassification.from_pretrained(
+        frame_dir,
+        num_labels=len(label2id),
+        id2label=id2label,
+        label2id=label2id
+    ).to(device)
+    model.eval()
+    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=128)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    preds = torch.argmax(outputs.logits, dim=-1).squeeze(0).numpy()
+    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze(0))
+    elements = []
+    for tok, lab_id in zip(tokens, preds):
+        if tok in {"[CLS]", "[SEP]", "[PAD]"}:
+            continue
+        label = id2label[lab_id]
+        if label != "O":
+            elements.append((tok, label))
+    return elements
+CONDITIONAL_MARKERS = ["اگر", "در صورتی که", "چنانچه", "به شرطی که"]
+def split_conditional(sentence):
+    sentence = sentence.strip()
+    # بررسی وجود ویرگول فارسی یا انگلیسی
+    if "،" in sentence:
+        parts = sentence.split("،", 1)
+    elif "," in sentence:
+        parts = sentence.split(",", 1)
+    else:
+        return False, None, sentence
+    first, second = parts[0].strip(), parts[1].strip()
+    for marker in CONDITIONAL_MARKERS:
+        if first.startswith(marker):
+            return True, first, second
+    return False, None, sentence
+"""## POS"""
+# Persian Stanza pipeline, built once at import time and run on the CPU.
+# sentence_has_subject_stanza reads the dependency relations it produces.
+nlp_fa = stanza.Pipeline(
+    lang='fa',
+    processors='tokenize,pos,lemma,depparse',
+    use_gpu=False
+)
+def sentence_has_subject_stanza(sentence):
+    doc = nlp_fa(sentence)
+    for sent in doc.sentences:
+        for word in sent.words:
+            if word.deprel == "nsubj":
+                return True
+    return False
+def extract_subject_from_condition(cond_srl, triples_df):
+    frame = cond_srl["frame"]
+    frame_elements = cond_srl["frame_elements"]
+    rows = triples_df[triples_df["Frame"] == frame]
+    subject_fes = list(rows["Subject"].unique())
+    for fe in subject_fes:
+        if fe in frame_elements and frame_elements[fe]:
+            return frame_elements[fe]
+    return None
+"""## build_srl_prompt"""
+def build_srl_prompt(sentence):
+    """Build the LLM prompt for labeling one Persian sentence.
+    The prompt embeds the Frame, FE_list, lexical_units_fa and lexical_units_en
+    columns of ``frames_df`` as a plain-text table, followed by the sentence
+    and the JSON shape the reply is asked to follow.
+    """
+    return f"""
+You are an expert in Persian Semantic Role Labeling.
+Frames definition (from FrameNet-style resource):
+{frames_df[['Frame','FE_list','lexical_units_fa','lexical_units_en']].to_string(index=False)}
+Task:
+For the Persian sentence below:
+1. Predict the most appropriate Frame.
+2. Extract Frame Elements (FE) as spans of text.
+3. Return output strictly in JSON.
+Sentence:
+"{sentence}"
+Output format:
+{{
+  "frame": "...",
+  "frame_elements": {{
+      "Agent": "...",
+      "Theme": "...",
+      "Time": "...",
+      ...
+  }}
+}}
+"""
+def deepseek_srl(sentence):
+    prompt = build_srl_prompt(sentence)
+    response = client.chat.completions.create(
+        model="deepseek-chat",
+        messages=[
+            {"role": "system", "content": "You perform Persian SRL."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0
+    )
+    return response.choices[0].message.content
+"""## extract_triples"""
+def extract_triples(frame, frame_elements):
+    rows = triples_df[triples_df["Frame"] == frame]
+    triples = []
+    for _, r in rows.iterrows():
+        subj = r["Subject"]
+        obj = r["Object"]
+        if subj in frame_elements and obj in frame_elements:
+            triples.append({
+                "subject": frame_elements[subj],
+                "relation": r["Relation"],
+                "object": frame_elements[obj],
+                "subject_fe": subj,
+                "object_fe": obj
+            })
+    return triples
+"""## extract_spin_rule"""
+def extract_spin_rule_v2(condition_result, consequence_result):
+    """Ask ``deepseek-chat`` for a SPIN rule linking a condition to its consequence.
+    Both SRL results are interpolated into the prompt as-is. Returns the raw
+    reply text; the prompt asks for JSON with the keys ``spin_turtle``,
+    ``persian_explanation`` and ``formal_logic_fa``.
+    """
+    # Extract the rule in three forms (Turtle, Persian explanation, formal Persian logic).
+    prompt = f"""
+You are an expert in Semantic Web, SPIN rules, and formal logic.
+Condition SRL result:
+{condition_result}
+Consequence SRL result:
+{consequence_result}
+Task:
+1. Generate a SPIN rule in Turtle syntax that represents:
+   IF condition holds THEN consequence holds.
+2. Explain this rule in clear natural Persian.
+3. Express the rule in formal Persian logical form using universal quantification.
+   Use structure like:
+   "برای هر x، اگر ... آنگاه ..."
+Return output strictly in JSON format:
+{{
+  "spin_turtle": "...",
+  "persian_explanation": "...",
+  "formal_logic_fa": "..."
+}}
+"""
+    response = client.chat.completions.create(
+        model="deepseek-chat",
+        messages=[
+            {"role": "system", "content": "You generate SPIN rules and formal Persian logic."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0
+    )
+    return response.choices[0].message.content
+"""## حذف None"""
+def clean_frame_elements(frame_elements):
+    return {
+        fe: val
+        for fe, val in frame_elements.items()
+        if val not in (None, "", "None")
+    }
+# def extract_triples_safe(frame, frame_elements):
+#     rows = triples_df[triples_df["Frame"] == frame]
+#     triples = []
+#     for _, r in rows.iterrows():
+#         subj = r["Subject"]
+#         obj = r["Object"]
+#         if subj in frame_elements and obj in frame_elements:
+#             s_val = frame_elements[subj]
+#             o_val = frame_elements[obj]
+#             if s_val and o_val:
+#                 triples.append({
+#                     "subject": s_val,
+#                     "relation": r["Relation"],
+#                     "object": o_val,
+#                     "subject_fe": subj,
+#                     "object_fe": obj
+#                 })
+#     return triples
+"""## analyze_text"""
+def safe_json_loads(text):
+    if not text:
+        return None
+    # حذف ```json ... ```
+    text = text.strip()
+    text = re.sub(r"^```json", "", text)
+    text = re.sub(r"^```", "", text)
+    text = re.sub(r"```$", "", text)
+    # استخراج اولین { ... }
+    match = re.search(r"\{.*\}", text, re.DOTALL)
+    if match:
+        json_text = match.group(0)
+        return json.loads(json_text)
+    return None
+# Frames that analyze_text_v5 always hands to the LLM, even when the trained
+# frame classifier predicts them with a high similarity.
+SPECIAL_DEEPSEEK_FRAMES = [
+    "Attempt","Becoming","Being_born","Causation","Chatting","Closure",
+    "Clothing","Desiring","Discussion","Emphasizing","Food","Memory",
+    "Morality_evaluation","Motion","Offering","Practice","Project",
+    "Religious_belief","Removing","Request","Sharing","Telling",
+    "Visiting","Work","Waiting"
+]
+def analyze_text_v5(text):
+    results = []
+    sentences = split_sentences(text)
+    for sent in sentences:
+        is_cond, cond, cons = split_conditional(sent)
+        if is_cond:
+            # ---------- تشخیص فریم جمله شرط ----------
+            frame_cond, sim_cond = predict_frame(cond)
+            frame_method_cond = "trained_model"
+            use_deepseek_cond = False
+            if frame_cond is None or sim_cond < 0.4 or frame_cond in SPECIAL_DEEPSEEK_FRAMES:
+                use_deepseek_cond = True
+                frame_method_cond = "LLM"
+            # ---------- SRL جمله شرط ----------
+            if use_deepseek_cond:
+                cond_srl = safe_json_loads(deepseek_srl(cond))
+                cond_srl["frame_method"] = "LLM"
+                cond_srl["fe_method"] = "LLM"
+            else:
+                elements = predict_frame_elements(cond, frame_cond)
+                fe_method = "trained_model"
+                cond_srl = {
+                    "frame": frame_cond,
+                    "frame_elements": {label: token for token, label in elements},
+                    "frame_method": "trained_model",
+                    "fe_method": fe_method
+                }
+            cond_srl["frame_elements"] = clean_frame_elements(cond_srl["frame_elements"])
+            cond_srl["frame_similarity"] = sim_cond
+            # ---------- بررسی فاعل در جمله دوم ----------
+            has_subject = sentence_has_subject_stanza(cons)
+            if not has_subject:
+                subject = extract_subject_from_condition(cond_srl, triples_df)
+                if subject:
+                    cons = subject + " " + cons
+            # ---------- تشخیص فریم جمله دوم ----------
+            frame_cons, sim_cons = predict_frame(cons)
+            frame_method_cons = "trained_model"
+            use_deepseek_cons = False
+            if frame_cons is None or sim_cons < 0.4 or frame_cons in SPECIAL_DEEPSEEK_FRAMES:
+                use_deepseek_cons = True
+                frame_method_cons = "LLM"
+            # ---------- SRL جمله دوم ----------
+            if use_deepseek_cons:
+                cons_srl = safe_json_loads(deepseek_srl(cons))
+                cons_srl["frame_method"] = "LLM"
+                cons_srl["fe_method"] = "LLM"
+            else:
+                elements = predict_frame_elements(cons, frame_cons)
+                fe_method = "trained_model"
+                cons_srl = {
+                    "frame": frame_cons,
+                    "frame_elements": {label: token for token, label in elements},
+                    "frame_method": "trained_model",
+                    "fe_method": fe_method
+                }
+            cons_srl["frame_elements"] = clean_frame_elements(cons_srl["frame_elements"])
+            cons_srl["frame_similarity"] = sim_cons
+            # ---------- استخراج triple ----------
+            cond_triples = extract_triples(cond_srl["frame"], cond_srl["frame_elements"])
+            cons_triples = extract_triples(cons_srl["frame"], cons_srl["frame_elements"])
+            # ---------- استخراج SPIN rule ----------
+            raw_spin = extract_spin_rule_v2(cond_srl, cons_srl)
+            spin_output = safe_json_loads(raw_spin)
+            if not spin_output:
+                spin_output = {
+                    "spin_turtle": None,
+                    "persian_explanation": None,
+                    "formal_logic_fa": None
+                }
+            results.append({
+                "type": "conditional",
+                "condition": {
+                    "sentence": cond,
+                    "srl": cond_srl,
+                    "triples": cond_triples
+                },
+                "consequence": {
+                    "sentence": cons,
+                    "srl": cons_srl,
+                    "triples": cons_triples
+                },
+                "spin_rule": spin_output["spin_turtle"],
+                "spin_explanation_fa": spin_output["persian_explanation"],
+                "formal_logic_fa": spin_output["formal_logic_fa"]
+            })
+        else:
+            # ---------- جمله ساده ----------
+            frame_name, sim = predict_frame(sent)
+            frame_method = "trained_model"
+            use_deepseek = False
+            if frame_name is None or sim < 0.4 or frame_name in SPECIAL_DEEPSEEK_FRAMES:
+                use_deepseek = True
+                frame_method = "LLM"
+            if use_deepseek:
+                srl = safe_json_loads(deepseek_srl(sent))
+                srl["frame_method"] = "LLM"
+                srl["fe_method"] = "LLM"
+            else:
+                elements = predict_frame_elements(sent, frame_name)
+                fe_method = "trained_model"
+                srl = {
+                    "frame": frame_name,
+                    "frame_elements": {label: token for token, label in elements},
+                    "frame_method": frame_method,
+                    "fe_method": fe_method
+                }
+            srl["frame_elements"] = clean_frame_elements(srl["frame_elements"])
+            srl["frame_similarity"] = sim
+            triples = extract_triples(srl["frame"], srl["frame_elements"])
+            results.append({
+                "type": "simple",
+                "sentence": sent,
+                "srl": srl,
+                "frame_similarity": sim,
+                "triples": triples
+            })
+    return results
+# -------------------------
+# Gradio UI
+# -------------------------
+def ui(sentence):
+    """Gradio callback: run the analysis on the textbox content and return the result list."""
+    return analyze_text_v5(sentence)
+# One text input, JSON output.
+demo = gr.Interface(
+    fn=ui,
+    inputs=gr.Textbox(
+        label="جمله فارسی",
+        placeholder="مثال: اگر علی با قطار به مشهد سفر کند، با استاندار مشهد گپ می زند."
+    ),
+    outputs=gr.JSON(label="خروجی"),
+    title="Persian_Semantic_Information_Extraction",
+)
+# Listen on all interfaces on port 7860, the port the Dockerfile exposes.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

final_frames.xlsx ADDED Viewed

Binary file (18.8 kB). View file

frame_triples2.xlsx ADDED Viewed

Binary file (20.8 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+torch
+transformers
+sentencepiece
+pandas
+numpy
+openpyxl
+gradio
+huggingface_hub
+nltk
+tqdm
+stanza
+# app.py imports `from openai import OpenAI` for the DeepSeek client.
+openai