Spaces:
Build error
Build error
File size: 4,240 Bytes
8d6f77a 0c65912 8d6f77a 70dec30 756fc41 8d6f77a 756fc41 0c65912 756fc41 0c65912 756fc41 da5079c 0c65912 da5079c 8d6f77a da5079c 8d6f77a da5079c 8d6f77a 756fc41 0c65912 8d6f77a 756fc41 da5079c 0c65912 70dec30 da5079c 8d6f77a 0c65912 70dec30 da5079c 70dec30 0c65912 70dec30 0c65912 da5079c 0c65912 ca3769d 0c65912 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import os
import glob
import json
import csv
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import zipfile
import xml.etree.ElementTree as ET
DATA_DIR = "/app/dados"
EXTRACT_DIR = "/app/dados_extraidos"


def setup_data():
    """Extract every .zip archive found under DATA_DIR into EXTRACT_DIR.

    Returns the directory that should subsequently be scanned for data
    files: EXTRACT_DIR when at least one archive was extracted, otherwise
    DATA_DIR itself.
    """
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    archives = glob.glob(DATA_DIR + "/**/*.zip", recursive=True)
    if not archives:
        print("Nenhum arquivo .zip encontrado, usando o diretório de dados principal.")
        return DATA_DIR
    for archive in archives:
        print(f"Descompactando {archive}...")
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(EXTRACT_DIR)
    return EXTRACT_DIR
def xml_to_dict(element):
    """Recursively convert an ElementTree element into nested dicts.

    A leaf element (no children) collapses to its text content (which may
    be None for an empty element); repeated child tags are gathered into a
    list under the shared tag name. Attributes, and text on non-leaf
    elements, are discarded.
    """
    children = {}
    for child in element:
        converted = xml_to_dict(child)
        if child.tag not in children:
            children[child.tag] = converted
        elif isinstance(children[child.tag], list):
            children[child.tag].append(converted)
        else:
            # Second occurrence of this tag: promote the entry to a list.
            children[child.tag] = [children[child.tag], converted]
    return children if children else element.text
def serialize_item_to_text(item_dict):
    """Flatten a (possibly nested) record into one descriptive string.

    Dicts become comma-separated "key: value" fragments; nested dicts are
    rendered as "key (…)" and lists as "key: [a, b, …]". Anything that is
    not a dict is simply stringified.
    """
    if not isinstance(item_dict, dict):
        return str(item_dict)
    fragments = []
    for key, value in item_dict.items():
        if isinstance(value, dict):
            fragments.append(f"{key} ({serialize_item_to_text(value)})")
        elif isinstance(value, list):
            inner = ", ".join(serialize_item_to_text(v) for v in value)
            fragments.append(f"{key}: [{inner}]")
        else:
            fragments.append(f"{key}: {value}")
    return ", ".join(fragments)
def _collect_documents(process_dir):
    """Parse every JSON/CSV/XML file under *process_dir* into flat text docs.

    Each record is serialized with serialize_item_to_text(); a file that
    fails to parse is reported and skipped (best-effort ingestion — one
    corrupt file must not abort the whole run).
    """
    all_files = (
        glob.glob(process_dir + "/**/*.json", recursive=True)
        + glob.glob(process_dir + "/**/*.csv", recursive=True)
        + glob.glob(process_dir + "/**/*.xml", recursive=True)
    )
    print(f"\n🔎 Encontrados {len(all_files)} arquivos (JSON, CSV, XML) para processar.")
    documents = []
    for filepath in tqdm(all_files, desc="Processando arquivos"):
        try:
            if filepath.endswith('.json'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # A top-level JSON array yields one document per item.
                items = data if isinstance(data, list) else [data]
                documents.extend(serialize_item_to_text(item) for item in items)
            elif filepath.endswith('.csv'):
                with open(filepath, 'r', encoding='utf-8') as f:
                    documents.extend(
                        serialize_item_to_text(row) for row in csv.DictReader(f)
                    )
            elif filepath.endswith('.xml'):
                root = ET.parse(filepath).getroot()
                documents.append(serialize_item_to_text({root.tag: xml_to_dict(root)}))
        except Exception as e:
            print(f"⚠️ Erro ao processar o arquivo {filepath}: {e}")
    return documents


def main():
    """Embed every JSON/CSV/XML record found in the data directory.

    Embeddings are written incrementally: each batch is appended to the
    output file as an independent ``np.save`` record. The file must
    therefore be read back with repeated ``np.load`` calls on a single
    open file handle (one per batch) — a single ``np.load(path)`` only
    returns the first batch.
    """
    process_dir = setup_data()
    # Some source CSVs carry very large cells; raise the parser limit.
    csv.field_size_limit(10_000_000)

    documents = _collect_documents(process_dir)
    print(f"\nProcessamento de arquivos concluído! {len(documents)} documentos foram criados.")
    if not documents:
        return

    cache_path = os.environ.get('SENTENCE_TRANSFORMERS_HOME', '/app/cache/torch')
    print("Carregando modelo de alta performance: intfloat/multilingual-e5-large")
    # NOTE(review): the e5 model card recommends prefixing inputs with
    # "passage: " / "query: " for best retrieval quality — confirm whether
    # downstream consumers expect un-prefixed embeddings before adding it.
    model = SentenceTransformer(
        'intfloat/multilingual-e5-large',
        cache_folder=cache_path,
    )

    batch_size = 128
    output_filename = '/app/output/meus_embeddings_e5_large.npy'
    # Bug fix: the output directory may not exist yet — open(..., 'ab')
    # would raise FileNotFoundError on a fresh environment.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    if os.path.exists(output_filename):
        os.remove(output_filename)

    print(f"🚀 Iniciando geração de embeddings (lotes de {batch_size}).")
    for start in tqdm(range(0, len(documents), batch_size), desc="Gerando Embeddings"):
        batch = documents[start:start + batch_size]
        batch_embeddings = model.encode(batch, show_progress_bar=False)
        # Append-mode np.save keeps peak memory flat for very large corpora
        # (see the read-back caveat in the docstring above).
        with open(output_filename, 'ab') as f_out:
            np.save(f_out, batch_embeddings)
    print(f"✅ Processo finalizado! Embeddings salvos em '{output_filename}'.")


if __name__ == "__main__":
    main()