# genipapo-parser / download_resources.py
# Bryan Khelven
# Initial deploy
# ffdedc7
import os
import requests
import hashlib
import sys
def download_genipapo_model(expected_md5=None):
    """Ensure the Genipapo model file exists at ``models/genipapo.pt``.

    If the file is already present it is checked against a reference MD5
    digest: a matching file is kept, a mismatching one is deleted and
    downloaded again. If the file is absent it is downloaded from the
    GitHub release URL.

    Args:
        expected_md5: Hex MD5 digest the model file is expected to have.
            When omitted, a module-level ``model_checksum`` is used if one
            is defined. When neither is available, an existing file is kept
            without verification instead of failing with ``NameError``.

    Raises:
        SystemExit: With exit code 1 when the server does not answer with
            HTTP 200.
    """
    # Direct download URL from GitHub Releases
    model_url = 'https://github.com/bryankhelven/genipapo/releases/download/Publishing/genipapo.pt'
    model_dir = 'models'
    model_path = os.path.join(model_dir, 'genipapo.pt')
    os.makedirs(model_dir, exist_ok=True)

    # Fall back to an optional module-level constant; looking it up through
    # globals() keeps the function usable when no such constant exists.
    if expected_md5 is None:
        expected_md5 = globals().get('model_checksum')

    if os.path.exists(model_path):
        if expected_md5 is None:
            print("Genipapo model already exists. "
                  "No reference checksum available; skipping verification.")
            return
        print("Genipapo model already exists. Verifying checksum...")
        # Hash in 1 MiB pieces so the whole model is never held in memory.
        digest = hashlib.md5()
        with open(model_path, 'rb') as f:
            for block in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(block)
        if digest.hexdigest() == expected_md5.lower():
            print("Checksum verified. Model is ready to use.")
            return
        print("Checksum mismatch. Redownloading the model...")
        os.remove(model_path)

    print("Downloading Genipapo model...")
    # Write to a side file first so an interrupted transfer never leaves a
    # truncated file under the final model name.
    partial_path = model_path + '.part'
    # timeout=(connect, read) in seconds: fail instead of hanging forever.
    with requests.get(model_url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print("Failed to download the model. Please check the URL.")
            sys.exit(1)
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    os.replace(partial_path, model_path)
    print("Download completed. Model is ready to use.")
# Directory where the downloaded resources are saved
RESOURCE_DIR = "stanza_resources"
# Language code; also used as the subdirectory name under RESOURCE_DIR
LANGUAGE = "pt"
# Map each required component to its (corrected) download URL
REQUIRED_COMPONENTS = {
"backward_charlm": "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models/backward_charlm/oscar2023.pt",
"forward_charlm": "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models/forward_charlm/oscar2023.pt",
"pretrain": "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models/pretrain/conll17.pt",
}
def download_file(url, dest_path, timeout=(10, 60)):
    """Stream the resource at *url* into the file *dest_path*.

    The body is written in 8 KiB chunks to ``<dest_path>.part`` and moved
    over *dest_path* only after the transfer finished, so a failed or
    interrupted download never leaves a truncated file at *dest_path*.
    No progress is reported.

    Args:
        url: Address of the file to fetch.
        dest_path: Path the downloaded file is stored at.
        timeout: Value passed to ``requests.get`` as ``timeout``; the
            default is (connect, read) seconds so a stalled connection
            raises instead of blocking forever.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    partial_path = f"{dest_path}.part"
    completed = False
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        os.replace(partial_path, dest_path)
        completed = True
    finally:
        # Drop the incomplete side file on any failure, including Ctrl-C.
        if not completed and os.path.exists(partial_path):
            os.remove(partial_path)
def download_specific_resources():
    """Download ``resources.json`` and every entry of REQUIRED_COMPONENTS.

    ``resources.json`` is stored directly in RESOURCE_DIR. Each component is
    stored as ``RESOURCE_DIR/LANGUAGE/<component>/model.pt``. Directories are
    created as needed and every file is fetched with ``download_file``.
    """
    index_url = "https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json"

    # Make sure the top-level resource directory is there
    if not os.path.exists(RESOURCE_DIR):
        os.makedirs(RESOURCE_DIR)

    # Fetch the `resources.json` index file
    index_path = os.path.join(RESOURCE_DIR, "resources.json")
    print("Baixando resources.json...")
    download_file(index_url, index_path)

    # Base directory for the language's resources
    language_root = os.path.join(RESOURCE_DIR, LANGUAGE)
    if not os.path.exists(language_root):
        os.makedirs(language_root)

    # Fetch each required component into its own subdirectory
    for name in REQUIRED_COMPONENTS:
        source_url = REQUIRED_COMPONENTS[name]
        target_dir = os.path.join(language_root, name)
        os.makedirs(target_dir, exist_ok=True)
        target_file = os.path.join(target_dir, "model.pt")
        print(f"Baixando {name}...")
        download_file(source_url, target_file)
        print(f"{name} baixado para {target_file}")

    print("Download concluído. Recursos disponíveis em:", RESOURCE_DIR)
# Script entry point: fetch the Genipapo model first, then the Stanza resources.
if __name__ == "__main__":
    download_genipapo_model()
    download_specific_resources()