| | import gradio as gr |
| | import spacy |
| | import pandas as pd |
| | from docx import Document |
| | from io import BytesIO |
| | import tempfile |
| | import os |
| | import multiprocessing as mp |
| | import psutil |
| | import time |
| | from datetime import datetime |
| |
|
| | |
# Load the Chinese transformer NER pipeline once at module import time,
# so worker code can reuse the single global `nlp` instance.
nlp = spacy.load('zh_core_web_trf')
| |
|
def get_system_status():
    """Return a one-line CPU/RAM usage summary with a timestamp.

    Polled periodically by the UI (see ``demo.load(..., every=5)``).
    """
    cpu_usage = psutil.cpu_percent()
    memory = psutil.virtual_memory()
    # Bug fix: "脷ltimo" was UTF-8-as-GBK mojibake for "Último".
    return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Último update: {datetime.now().strftime('%H:%M:%S')}"
| |
|
def extract_names_from_text(text):
    """Run spaCy NER over *text* and return all PERSON entity mentions.

    Duplicate mentions are kept; the caller deduplicates across fragments.
    """
    # Debug trace of fragment size. Bug fix: the escape was mistyped as
    # '/n/n', which printed a literal "/n/n" instead of blank lines.
    print(f'{len(text)}\n\n')
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
| |
|
def split_text(text, max_length=100000):
    """Split *text* into chunks of at most *max_length* characters.

    Chunks are cut only on '\n' (paragraph) boundaries; a single paragraph
    longer than *max_length* becomes its own oversized chunk rather than
    being cut mid-paragraph.

    Args:
        text: The full text to split.
        max_length: Soft upper bound on chunk size, in characters.

    Returns:
        A list of chunk strings (never containing a spurious empty chunk).
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for paragraph in text.split('\n'):
        paragraph_length = len(paragraph) + 1  # +1 accounts for the '\n' separator
        if current_length + paragraph_length <= max_length:
            current_chunk.append(paragraph)
            current_length += paragraph_length
        else:
            # Bug fix: when the very first paragraph already exceeds
            # max_length, current_chunk is empty and the original code
            # appended ''.join([]) == '' — an empty chunk that would be
            # pointlessly fed to the NER. Only flush non-empty chunks.
            if current_chunk:
                chunks.append('\n'.join(current_chunk))
            current_chunk = [paragraph]
            current_length = paragraph_length

    if current_chunk:
        chunks.append('\n'.join(current_chunk))

    return chunks
| |
|
def extract_names_from_fragments(fragments):
    """Run NER over every fragment in parallel.

    Fans the fragments out to 4 worker processes and returns one list of
    PERSON mentions per fragment, in the original fragment order.
    """
    with mp.Pool(processes=4) as worker_pool:
        return worker_pool.map(extract_names_from_text, fragments)
| |
|
def extract_names_from_docx(docx_file, progress=gr.Progress()):
    """Extract unique person names from a .docx file into an .xlsx report.

    Pipeline: read paragraphs -> chunk text -> parallel spaCy NER ->
    deduplicate -> write a one-column Excel file.

    Args:
        docx_file: Path-like / file object accepted by python-docx.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Path to the generated "nombres_personas.xlsx" temp file.
    """
    progress(0, desc="Iniciando procesamiento...")

    document = Document(docx_file)
    full_text = [para.text for para in document.paragraphs]
    progress(0.2, desc="Documento cargado, preparando texto...")

    # Bug fix: paragraphs were joined with ' ', but split_text() cuts only
    # on '\n' — so the text was never actually chunked and the whole
    # document went to the NER as one piece. Join with newlines instead.
    text = '\n'.join(full_text)

    text_fragments = split_text(text)
    progress(0.3, desc=f"Texto dividido en {len(text_fragments)} fragmentos...")

    all_persons = []
    # NOTE(review): pool.map inside extract_names_from_fragments blocks
    # until every fragment is processed, so this loop (and its progress
    # updates) only runs after all NER work has already finished.
    for i, fragment_results in enumerate(extract_names_from_fragments(text_fragments)):
        all_persons.extend(fragment_results)
        progress(0.3 + 0.5 * (i + 1) / len(text_fragments),
                 desc=f"Procesando fragmento {i+1} de {len(text_fragments)}...")

    # Deduplicate; sorted() makes the output deterministic run-to-run
    # (plain list(set(...)) had arbitrary order).
    all_persons = sorted(set(all_persons))
    progress(0.9, desc="Preparando resultados...")

    df = pd.DataFrame(all_persons, columns=['Nombres'])

    # Write the report into a fresh temp dir so concurrent requests never
    # clobber each other's files.
    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "nombres_personas.xlsx")
    with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)

    # Bug fix: "隆" was mojibake for the inverted exclamation mark "¡".
    progress(1.0, desc="¡Procesamiento completado!")
    return temp_file_path
| |
|
| | |
# --- Gradio UI wiring -------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Extractor de Nombres")
    gr.Markdown("Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy.")

    # Live CPU/RAM readout, refreshed every 5 seconds via demo.load below.
    status_box = gr.Textbox(label="Estado del Sistema", value="Inicializando...")

    # Input document and the resulting .xlsx download slot.
    docx_input = gr.File(file_types=[".docx"])
    results_file = gr.File(label="Archivo de resultados")

    run_button = gr.Button("Procesar Documento")
    run_button.click(fn=extract_names_from_docx, inputs=docx_input, outputs=results_file)

    # Periodic poll that pushes get_system_status() output into the status box.
    demo.load(get_system_status, None, status_box, every=5)

demo.launch()