Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pyarabic.araby as araby
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import re
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from datasets import Features
|
| 7 |
+
from datasets import Value
|
| 8 |
+
from datasets import Dataset
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Build the combined hadith dataframe `df`: matn (text) metadata joined with
# book information and with the remaining columns of the embedding dataset.

# `os` is needed for the environment lookup below; the file's import list
# does not bring it in, so import it here to avoid a NameError.
import os

# Hugging Face access token, read from the environment (None if unset).
Secret_token = os.getenv('HF_token')

# Embedding dataset and the per-book lookup table.
dataset = load_dataset("FDSRashid/embed_matn", token=Secret_token)
books = load_dataset(
    'FDSRashid/Hadith_info',
    data_files='Books.csv',
    token=Secret_token,
)['train'].to_pandas()
df = dataset["train"].to_pandas()

# NOTE(review): `features` is not defined anywhere in the visible file, so
# this call raises NameError as written. The `Features`/`Value` imports
# suggest an explicit schema was intended (the code below treats 'taraf_ID'
# and 'bookid_hadithid' as strings) -- TODO define `features` from the
# actual All_Matns.csv header before this line.
dataset = load_dataset(
    "FDSRashid/hadith_info",
    data_files='All_Matns.csv',
    token=Secret_token,
    features=features,
)
matn_info = dataset['train'].to_pandas()

# Remove two rows by index label. Presumably these are known-bad records;
# the reason is not recorded here -- confirm against the data.
matn_info = matn_info.drop([97550, 307206])

# 'KeyAbsent' marks a missing taraf id; map it to -1 so the column can be
# converted to integers.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# 'bookid_hadithid' has the form '<book id>_<hadith number>'.
matn_info['Book ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))

# NOTE(review): `join(..., on='Book ID')` matches 'Book ID' against the
# *index* of `books` (whatever `to_pandas()` produced), not against a column
# of `books` -- confirm that the row position in Books.csv equals the book id.
matn_info = matn_info.join(books, on='Book ID')

# Bring over only the columns of the embedding frame that matn_info lacks,
# aligning matn_info's index with the embedding frame's
# '__index_level_0__' column.
cols_to_use = df.columns.difference(matn_info.columns)

joined_df = matn_info.merge(df[cols_to_use], left_index=True, right_on='__index_level_0__')
df = joined_df.copy()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|