Spaces:

leilaghomashchi
/

Benchmark-data-anonymization

Sleeping

App Files Files Community

leilaghomashchi committed on Sep 23, 2025

Commit

ddd1a1c

verified ·

1 Parent(s): d838525

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -10

app.py CHANGED Viewed

@@ -15,19 +15,46 @@ class AnonymizationEvaluator:
         self.results_df = None
     def extract_entities_from_text(self, text: str) -> Dict[str, Set[str]]:
-        """استخراج موجودیت‌ها از متن"""
         if pd.isna(text) or not isinstance(text, str):
             return {'companies': set(), 'persons': set(), 'amounts': set(), 'percents': set(), 'groups': set()}
-        entities = {
-            'companies': set(re.findall(r'company-(\d+)', text)),
-            'persons': set(re.findall(r'person-(\d+)', text)),
-            'amounts': set(re.findall(r'amount-(\d+)', text)),
-            'percents': set(re.findall(r'percent-(\d+)', text)),
-            'groups': set(re.findall(r'group-(\d+)', text))
         }
         return entities
     def calculate_precision_recall_f1(self, reference_entities: Dict[str, Set[str]],
                                     predicted_entities: Dict[str, Set[str]]) -> Tuple[float, float, float]:
         """محاسبه Precision, Recall و F1-Score"""
@@ -101,7 +128,7 @@ class AnonymizationEvaluator:
             return 0.0, 0.0, 0.0
     def evaluate_dataset(self, file_path: str) -> Tuple[bool, str, pd.DataFrame]:
-        """ارزیابی کل دیتاست"""
         try:
             # بارگذاری فایل
             df = pd.read_csv(file_path)
@@ -113,11 +140,27 @@ class AnonymizationEvaluator:
             if missing_columns:
                 return False, f"ستون‌های مفقود: {', '.join(missing_columns)}", pd.DataFrame()
             # محاسبه متریک‌ها برای هر سطر
             precisions = []
             recalls = []
             f1_scores = []
             for index, row in df.iterrows():
                 precision, recall, f1 = self.evaluate_single_row(
                     row['Reference_text'],
@@ -127,6 +170,12 @@ class AnonymizationEvaluator:
                 precisions.append(round(precision, 4))
                 recalls.append(round(recall, 4))
                 f1_scores.append(round(f1, 4))
             # اضافه کردن ستون‌های جدید
             df['Precision'] = precisions
@@ -136,7 +185,12 @@ class AnonymizationEvaluator:
             # ذخیره نتایج
             self.results_df = df
-            return True, "ارزیابی با موفقیت انجام شد", df
         except Exception as e:
             return False, f"خطا در پردازش فایل: {str(e)}", pd.DataFrame()

         self.results_df = None
     def extract_entities_from_text(self, text: str) -> Dict[str, Set[str]]:
         """Extract anonymization placeholder IDs from ``text``.

         A placeholder has the form ``<label>-<digits>`` where the label is
         one of ``company``, ``person``, ``amount``, ``percent`` or ``group``.
         Labels are matched case-insensitively, so ``company-1``,
         ``Company-1`` and ``COMPANY-1`` all yield the ID ``'1'``.

         Args:
             text: Text to scan. Any value that is not a ``str`` (``None``,
                 NaN, a list, ...) is treated as containing no entities.

         Returns:
             A dict with the keys ``companies``, ``persons``, ``amounts``,
             ``percents`` and ``groups``; each maps to the set of digit
             strings found after the corresponding label.
         """
         # Result key -> placeholder label as it appears in the text.
         labels = {
             'companies': 'company',
             'persons': 'person',
             'amounts': 'amount',
             'percents': 'percent',
             'groups': 'group',
         }
         # A str is never NaN, so the type check alone covers missing values
         # and avoids pd.isna() raising on list/array inputs.
         if not isinstance(text, str):
             return {entity_type: set() for entity_type in labels}
         # One case-insensitive scan per label replaces the three
         # hard-coded case variants and also catches mixed-case labels.
         return {
             entity_type: set(re.findall(rf'{label}-(\d+)', text, re.IGNORECASE))
             for entity_type, label in labels.items()
         }
+    def debug_text_analysis(self, reference_text: str, predicted_text: str, row_num: int = 0) -> str:
+        """تابع debugging برای تحلیل متن‌ها"""
+        debug_info = f"\n--- Debug Row {row_num + 1} ---\n"
+        debug_info += f"Reference: '{reference_text[:100]}...'\n"
+        debug_info += f"Predicted: '{predicted_text[:100]}...'\n"
+        ref_entities = self.extract_entities_from_text(reference_text)
+        pred_entities = self.extract_entities_from_text(predicted_text)
+        debug_info += f"Reference entities: {dict(ref_entities)}\n"
+        debug_info += f"Predicted entities: {dict(pred_entities)}\n"
+        return debug_info
     def calculate_precision_recall_f1(self, reference_entities: Dict[str, Set[str]],
                                     predicted_entities: Dict[str, Set[str]]) -> Tuple[float, float, float]:
         """محاسبه Precision, Recall و F1-Score"""
             return 0.0, 0.0, 0.0
     def evaluate_dataset(self, file_path: str) -> Tuple[bool, str, pd.DataFrame]:
+        """ارزیابی کل دیتاست با debugging"""
         try:
             # بارگذاری فایل
             df = pd.read_csv(file_path)
             if missing_columns:
                 return False, f"ستون‌های مفقود: {', '.join(missing_columns)}", pd.DataFrame()
+            # تشخیص مشکل - بررسی نمونه‌ای از داده‌ها
+            debug_info = "\n=== Debug Information ===\n"
+            debug_info += f"تعداد سطرها: {len(df)}\n"
+            debug_info += f"ستون‌ها: {list(df.columns)}\n\n"
+            # بررسی چند سطر اول
+            for i in range(min(3, len(df))):
+                ref_text = str(df.iloc[i]['Reference_text'])
+                anon_text = str(df.iloc[i]['anonymized_text'])
+                debug_info += self.debug_text_analysis(ref_text, anon_text, i)
+            print(debug_info)  # نمایش در console
             # محاسبه متریک‌ها برای هر سطر
             precisions = []
             recalls = []
             f1_scores = []
+            total_entities_found = 0  # شمارنده کل موجودیت‌های یافت شده
             for index, row in df.iterrows():
                 precision, recall, f1 = self.evaluate_single_row(
                     row['Reference_text'],
                 precisions.append(round(precision, 4))
                 recalls.append(round(recall, 4))
                 f1_scores.append(round(f1, 4))
+                # شمارش موجودیت‌ها برای debugging
+                ref_entities = self.extract_entities_from_text(str(row['Reference_text']))
+                pred_entities = self.extract_entities_from_text(str(row['anonymized_text']))
+                total_entities_found += sum(len(entities) for entities in ref_entities.values())
+                total_entities_found += sum(len(entities) for entities in pred_entities.values())
             # اضافه کردن ستون‌های جدید
             df['Precision'] = precisions
             # ذخیره نتایج
             self.results_df = df
+            # پیام وضعیت شامل اطلاعات debugging
+            status_message = f"ارزیابی انجام شد. کل موجودیت‌های یافت شده: {total_entities_found}"
+            if total_entities_found == 0:
+                status_message += "\n⚠️ هیچ موجودیتی تشخیص داده نشد! لطفاً فرمت داده‌ها را بررسی کنید."
+            return True, status_message, df
         except Exception as e:
             return False, f"خطا در پردازش فایل: {str(e)}", pd.DataFrame()