| | from transformers import pipeline |
| | import gradio as gr |
| | import docx |
| | import pandas as pd |
| |
|
| | |
# Token-classification (NER) pipeline backed by a Chinese BERT model
# fine-tuned on the People's Daily corpus (loaded once at import time).
model_name = "johnyyhk/bert-finetuned-ner-chinese-people-daily"
get_completion = pipeline("ner", model=model_name)
| |
|
| | |
def merge_tokens(tokens):
    """Merge BERT sub-word NER tokens into whole-entity tokens.

    A token is merged into the previous one when it carries an ``I-`` tag
    whose entity type matches the previous merged token's type (e.g. an
    ``I-PER`` following a ``B-PER`` or ``I-PER``).

    Args:
        tokens: pipeline output dicts with at least ``entity``, ``word``,
            ``start``, ``end`` and ``score`` keys.

    Returns:
        A new list of merged token dicts. The caller's dicts are left
        untouched (the original implementation appended them by reference
        and then mutated them in place).
    """
    merged_tokens = []
    for token in tokens:
        if (merged_tokens
                and token['entity'].startswith('I-')
                and merged_tokens[-1]['entity'].endswith(token['entity'][2:])):
            # Continuation of the previous entity: extend the surface form
            # and span, and keep a running average of confidence scores.
            last_token = merged_tokens[-1]
            # Strip the WordPiece sub-word marker before concatenating.
            last_token['word'] += token['word'].replace('##', '')
            last_token['end'] = token['end']
            last_token['score'] = (last_token['score'] + token['score']) / 2
        else:
            # Start of a new entity; copy so the input dict is not mutated.
            merged_tokens.append(dict(token))
    return merged_tokens
| |
|
| | |
def extract_person_names(tokens):
    """Collect person names from (merged) NER tokens.

    Concatenates each ``B-PER`` token with the ``I-PER`` tokens that
    immediately follow it into a single name string. A dangling ``I-PER``
    with no open name is ignored.

    Args:
        tokens: token dicts with ``entity`` and ``word`` keys.

    Returns:
        Unique person names, deduplicated while preserving first-seen
        order. (The original ``list(set(names))`` made the output order
        arbitrary from run to run.)
    """
    names = []
    current_name = ""
    for token in tokens:
        if token['entity'] == 'B-PER':
            # A new name starts; flush any name still in progress.
            if current_name:
                names.append(current_name)
            current_name = token['word']
        elif token['entity'] == 'I-PER' and current_name:
            current_name += token['word']
        else:
            if current_name:
                names.append(current_name)
            current_name = ""
    if current_name:
        names.append(current_name)
    # dict.fromkeys dedupes deterministically in first-appearance order.
    return list(dict.fromkeys(names))
| |
|
| | |
def process_docx(file_path):
    """Read a DOCX file and return its non-empty paragraph texts.

    A single DOCX paragraph may still contain manual line breaks, so each
    paragraph is further split on newlines and every piece is stripped.

    Args:
        file_path: path to the ``.docx`` file on disk.

    Returns:
        A flat list of non-empty, whitespace-trimmed text lines.
    """
    document = docx.Document(file_path)
    collected = []
    for paragraph in document.paragraphs:
        for piece in paragraph.text.split("\n"):
            cleaned = piece.strip()
            if cleaned:
                collected.append(cleaned)
    return collected
| |
|
| | |
def create_paragraph_blocks(paragraphs, block_size=4):
    """Group consecutive paragraphs into newline-joined text blocks.

    Args:
        paragraphs: list of paragraph strings.
        block_size: number of paragraphs per block (the last block may
            be shorter).

    Returns:
        List of strings, each the ``"\\n"``-join of up to ``block_size``
        consecutive paragraphs.
    """
    blocks = []
    start = 0
    while start < len(paragraphs):
        blocks.append("\n".join(paragraphs[start:start + block_size]))
        start += block_size
    return blocks
| |
|
| | |
def process_ner(file):
    """Run NER over an uploaded DOCX and save unique person names to Excel.

    Args:
        file: uploaded file object exposing a ``.name`` path attribute
            (as provided by Gradio's File input).

    Returns:
        Path of the written Excel file (``"ner_output.xlsx"``).
    """
    # Batch paragraphs so each pipeline call sees several at once.
    blocks = create_paragraph_blocks(process_docx(file.name))

    found = set()
    for text_block in blocks:
        entities = merge_tokens(get_completion(text_block))
        found.update(extract_person_names(entities))

    # Persist the unique names as a one-column spreadsheet.
    df = pd.DataFrame({'Person Names': list(found)})
    output_path = "ner_output.xlsx"
    df.to_excel(output_path, index=False)

    return output_path
| |
|
| | |
def ner_interface(file):
    """Gradio callback: run the NER pipeline and report the result path.

    Args:
        file: uploaded DOCX file object from the Gradio File component.

    Returns:
        Tuple of (status message, path to the generated Excel file).
    """
    saved_path = process_ner(file)
    message = f"NER completado. Archivo guardado en: {saved_path}"
    return message, saved_path
| |
|
# Gradio app wiring: DOCX upload in, status text plus Excel download out.
_upload_input = gr.File(label="Sube un archivo DOCX")
_result_outputs = [gr.Textbox(label="Resultado"), gr.File(label="Descargar archivo")]

demo = gr.Interface(
    fn=ner_interface,
    inputs=_upload_input,
    outputs=_result_outputs,
    title="NER de Nombres de Personas",
    description="Extrae nombres de personas desde un archivo DOCX usando NER y guarda los resultados en un archivo Excel.",
    allow_flagging="never",
)

demo.launch(inline=False)