Spaces:
Sleeping
Sleeping
Commit ·
b4b8d2a
1
Parent(s): 7e00fac
progress more 20..
Browse files
app.py
CHANGED
|
@@ -108,12 +108,14 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
| 108 |
def process_file(uploaded_file):
|
| 109 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 110 |
|
|
|
|
|
|
|
| 111 |
# Apply fuzzy deduplication
|
| 112 |
df = df.groupby('Объект').apply(
|
| 113 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
| 114 |
).reset_index(drop=True)
|
| 115 |
|
| 116 |
-
|
| 117 |
remaining_news_count = len(df)
|
| 118 |
duplicates_removed = original_news_count - remaining_news_count
|
| 119 |
|
|
|
|
| 108 |
def process_file(uploaded_file):
|
| 109 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 110 |
|
| 111 |
+
original_news_count = len(df)
|
| 112 |
+
|
| 113 |
# Apply fuzzy deduplication
|
| 114 |
df = df.groupby('Объект').apply(
|
| 115 |
lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)
|
| 116 |
).reset_index(drop=True)
|
| 117 |
|
| 118 |
+
|
| 119 |
remaining_news_count = len(df)
|
| 120 |
duplicates_removed = original_news_count - remaining_news_count
|
| 121 |
|