Spaces:
Build error
Build error
| import os | |
| import glob | |
| import json | |
| import csv | |
| import numpy as np | |
| from tqdm.auto import tqdm | |
| from sentence_transformers import SentenceTransformer | |
| import zipfile | |
| import xml.etree.ElementTree as ET | |
# Directory where the raw input data is mounted inside the container.
DATA_DIR = "/app/dados"
# Directory into which .zip archives found under DATA_DIR are extracted.
EXTRACT_DIR = "/app/dados_extraidos"
def setup_data():
    """Extract every .zip archive found under DATA_DIR into EXTRACT_DIR.

    Returns:
        The directory that should be scanned for data files: EXTRACT_DIR
        when at least one archive was extracted, otherwise DATA_DIR.
    """
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    archives = glob.glob(DATA_DIR + "/**/*.zip", recursive=True)
    if not archives:
        print("Nenhum arquivo .zip encontrado, usando o diretório de dados principal.")
        return DATA_DIR
    for archive_path in archives:
        print(f"Descompactando {archive_path}...")
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(EXTRACT_DIR)
    return EXTRACT_DIR
def xml_to_dict(element):
    """Recursively convert an ElementTree element into nested dicts.

    Repeated child tags are collected into a list (in document order);
    a leaf element with no children collapses to its text content
    (possibly None). Element attributes are ignored.
    """
    result = {}
    for child in element:
        converted = xml_to_dict(child)
        if child.tag not in result:
            result[child.tag] = converted
        else:
            existing = result[child.tag]
            if isinstance(existing, list):
                existing.append(converted)
            else:
                result[child.tag] = [existing, converted]
    # No children at all: this is a leaf, expose its raw text.
    return result if result else element.text
def serialize_item_to_text(item_dict):
    """Flatten a (possibly nested) dict into a single descriptive string.

    Nested dicts render as "key (inner)", lists as "key: [a, b]", and
    scalars as "key: value"; any non-dict input is stringified directly.
    """
    if not isinstance(item_dict, dict):
        return str(item_dict)
    fragments = []
    for key, value in item_dict.items():
        if isinstance(value, dict):
            fragments.append(f"{key} ({serialize_item_to_text(value)})")
        elif isinstance(value, list):
            inner = ", ".join(serialize_item_to_text(entry) for entry in value)
            fragments.append(f"{key}: [{inner}]")
        else:
            fragments.append(f"{key}: {value}")
    return ", ".join(fragments)
def _parse_files(all_files):
    """Parse each JSON/CSV/XML file in *all_files* into flat text documents.

    Best-effort: a single corrupt or unreadable file is reported and
    skipped rather than aborting the whole run.
    """
    documents = []
    for filepath in tqdm(all_files, desc="Processando arquivos"):
        try:
            if filepath.endswith('.json'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # A top-level JSON array yields one document per item.
                items = data if isinstance(data, list) else [data]
                for item in items:
                    documents.append(serialize_item_to_text(item))
            elif filepath.endswith('.csv'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    for row in csv.DictReader(f):
                        documents.append(serialize_item_to_text(row))
            elif filepath.endswith('.xml'):
                root = ET.parse(filepath).getroot()
                documents.append(serialize_item_to_text({root.tag: xml_to_dict(root)}))
        except Exception as e:
            print(f"⚠️ Erro ao processar o arquivo {filepath}: {e}")
    return documents


def _embed_and_save(documents, batch_size=128,
                    output_filename='/app/output/meus_embeddings_e5_large.npy'):
    """Encode *documents* in batches and append each batch to *output_filename*.

    Note: the output file contains one concatenated .npy blob per batch;
    read it back with repeated np.load calls on the same open file handle,
    not a single np.load of the whole file.
    """
    cache_path = os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/app/cache/torch')
    print("Carregando modelo de alta performance: intfloat/multilingual-e5-large")
    # NOTE(review): E5-family models expect "query: "/"passage: " prefixes on
    # inputs for best retrieval quality — confirm against downstream usage.
    model = SentenceTransformer(
        'intfloat/multilingual-e5-large',
        cache_folder=cache_path
    )
    # Fix: ensure the output directory exists, otherwise the first
    # open(..., 'ab') below raises FileNotFoundError.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    # Start from a clean file so blobs from a previous run don't accumulate.
    if os.path.exists(output_filename):
        os.remove(output_filename)
    print(f"🚀 Iniciando geração de embeddings (lotes de {batch_size}).")
    for start in tqdm(range(0, len(documents), batch_size), desc="Gerando Embeddings"):
        batch = documents[start:start + batch_size]
        batch_embeddings = model.encode(batch, show_progress_bar=False)
        # Append this batch immediately so memory stays bounded by batch_size.
        with open(output_filename, 'ab') as f_out:
            np.save(f_out, batch_embeddings)
    print(f"✅ Processo finalizado! Embeddings salvos em '{output_filename}'.")


def main():
    """End-to-end pipeline: extract archives, parse files, embed, persist."""
    process_dir = setup_data()
    # Large CSV cells (e.g. embedded JSON) exceed the default ~131 kB field limit.
    csv.field_size_limit(10_000_000)
    all_files = glob.glob(process_dir + "/**/*.json", recursive=True) + \
                glob.glob(process_dir + "/**/*.csv", recursive=True) + \
                glob.glob(process_dir + "/**/*.xml", recursive=True)
    print(f"\n🔎 Encontrados {len(all_files)} arquivos (JSON, CSV, XML) para processar.")
    if not all_files:
        return
    documents = _parse_files(all_files)
    print(f"\nProcessamento de arquivos concluído! {len(documents)} documentos foram criados.")
    if not documents:
        return
    _embed_and_save(documents)
# Script entry point: run the full pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()