# Extract Arabic person names from JSON records using the MAREFA NER model.
# Dependencies: stdlib first, then third-party (PEP 8 grouping).
import json

import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Fetch the 'punkt' sentence/word tokenizer data that nltk.tokenize needs at
# call time (the import alone does not require it).
nltk.download('punkt')
# Function to extract names using MAREFA NER model
def extract_arabic_names(json_data, model, tokenizer):
    """Collect person names tagged by a token-classification NER model.

    Args:
        json_data: iterable of dicts; entries with an "Arabic Text" key are
            processed, all others are skipped.
        model: a token-classification model whose ``config.id2label`` maps
            int ids to labels such as "B-person" / "I-person" / "O".
        tokenizer: the matching tokenizer (``tokenize`` + ``__call__`` with
            ``return_tensors="pt"``).

    Returns:
        set[str]: unique space-joined "B-person (I-person)*" token spans.
    """
    arabic_names = set()
    for entry in json_data:
        if "Arabic Text" not in entry:
            continue
        text = entry["Arabic Text"]
        tokens = tokenizer.tokenize(text)
        inputs = tokenizer(text, return_tensors="pt")
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
        # .tolist() yields plain ints; indexing id2label with 0-dim tensors
        # would raise KeyError (dict keys are ints).
        label_ids = predictions[0].tolist()
        labels = [model.config.id2label[i] for i in label_ids]
        # The encoded input carries special tokens ([CLS]/[SEP]) that
        # tokenizer.tokenize() does not produce; drop their label positions
        # so labels align with `tokens` instead of being shifted by one.
        if len(labels) == len(tokens) + 2:
            labels = labels[1:-1]
        current_name = ""
        for token, label in zip(tokens, labels):
            if label == "B-person":
                # Flush a pending name so back-to-back B-person spans
                # don't overwrite (and lose) the first one.
                if current_name:
                    arabic_names.add(current_name)
                current_name = token
            elif label == "I-person" and current_name:
                current_name += " " + token
            else:
                # Any non-continuation label ("O" included) ends the span;
                # the original only flushed on non-"O" labels, letting a
                # stray later I-person extend a stale name.
                if current_name:
                    arabic_names.add(current_name)
                current_name = ""
        if current_name:
            arabic_names.add(current_name)
    return arabic_names
# --- Load the MAREFA NER model and its tokenizer ---------------------------
MODEL_NAME = "marefa-nlp/marefa-ner"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# --- Read the input records --------------------------------------------------
INPUT_PATH = 'cache/output/basic_info_frame.json'
with open(INPUT_PATH, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Run NER over every record that carries Arabic text.
arabic_names = extract_arabic_names(json_data, model, tokenizer)
# Report what was found (guard clause first: empty set means no hits).
if not arabic_names:
    print("No Arabic names found.")
else:
    print("Arabic names extracted:")
    for name in arabic_names:
        print("Name:", name)