diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,11 +1,11 @@ -from flask import Flask, render_template_string, request, redirect, url_for, jsonify, flash, send_from_directory +from flask import Flask, render_template_string, request, redirect, url_for, jsonify, flash import json import os import logging import threading import time from datetime import datetime -from huggingface_hub import HfApi, hf_hub_download, list_repo_files +from huggingface_hub import HfApi, hf_hub_download, delete_file as hf_delete_file from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError, EntryNotFoundError from werkzeug.utils import secure_filename from dotenv import load_dotenv @@ -18,49 +18,44 @@ load_dotenv() app = Flask(__name__) app.secret_key = os.getenv("FLASK_SECRET_KEY", 'tontalent_secret_key_for_flash_messages_only') - DATA_FILE = 'tontalent_data.json' -UPLOADS_DIR_NAME = 'uploads' # Relative to project root -app.config['UPLOAD_FOLDER'] = UPLOADS_DIR_NAME -os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) - -MAX_IMAGE_FILES = 10 -ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'webp'} +SYNC_FILES = [DATA_FILE] # For DB file itself -SYNC_FILES = [DATA_FILE] # Main data file, images handled separately based on this file +# Configuration for image uploads +UPLOAD_FOLDER_NAME = 'uploads_temp' # Temporary local storage before HF upload +app.config['UPLOAD_FOLDER'] = os.path.join(app.instance_path, UPLOAD_FOLDER_NAME) +app.config['MAX_IMAGE_UPLOADS'] = 10 +app.config['ALLOWED_EXTENSIONS'] = {'png', 'jpg', 'jpeg', 'gif'} REPO_ID = os.getenv("HF_REPO_ID", "Kgshop/tontalent2") HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE") HF_TOKEN_READ = os.getenv("HF_TOKEN_READ") -TELEGRAM_BOT_TOKEN = "7549355625:AAGhdbf6x1JEzpH0mUtuxTF83Soi7MFVNZ8" # Replace with your actual bot token +TELEGRAM_BOT_TOKEN = "7549355625:AAGhdbf6x1JEzpH0mUtuxTF83Soi7MFVNZ8" # Replace with your actual bot token if needed DOWNLOAD_RETRIES = 3 DOWNLOAD_DELAY = 5 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -def allowed_file(filename): - return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS +# Ensure instance folder and temp upload folder exist +try: + os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) +except OSError as e: + logging.error(f"Error creating upload folder {app.config['UPLOAD_FOLDER']}: {e}") + -def _get_all_image_paths_from_data(data_dict): - image_paths = set() - for item_type_key in ['resumes', 'vacancies', 'freelance_offers']: - for item in data_dict.get(item_type_key, []): - for img_path in item.get('images', []): - if img_path.startswith(UPLOADS_DIR_NAME + "/"): # Ensure it's a path we manage - image_paths.add(img_path) - return list(image_paths) +def allowed_file(filename): + return '.' in filename and \ + filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS'] def download_db_from_hf(specific_file=None, retries=DOWNLOAD_RETRIES, delay=DOWNLOAD_DELAY): if not HF_TOKEN_READ and not HF_TOKEN_WRITE: logging.warning("HF_TOKEN_READ/HF_TOKEN_WRITE not set. Download might fail for private repos.") token_to_use = HF_TOKEN_READ if HF_TOKEN_READ else HF_TOKEN_WRITE - files_to_download = [specific_file] if specific_file else SYNC_FILES - logging.info(f"Attempting download for primary files {files_to_download} from {REPO_ID}...") + logging.info(f"Attempting download for {files_to_download} from {REPO_ID}...") all_successful = True - for file_name in files_to_download: success = False for attempt in range(retries + 1): @@ -75,11 +70,10 @@ def download_db_from_hf(specific_file=None, retries=DOWNLOAD_RETRIES, delay=DOWN success = True break except RepositoryNotFoundError: - logging.error(f"Repository {REPO_ID} not found. Download cancelled.") - return False - except (HfHubHTTPError, EntryNotFoundError) as e: - is_404 = isinstance(e, EntryNotFoundError) or (isinstance(e, HfHubHTTPError) and e.response.status_code == 404) - if is_404: + logging.error(f"Repository {REPO_ID} not found. Download cancelled for all files.") + return False + except HfHubHTTPError as e: + if e.response.status_code == 404: logging.warning(f"File {file_name} not found in repo {REPO_ID} (404). Skipping this file.") if attempt == 0 and not os.path.exists(file_name): try: @@ -87,169 +81,96 @@ def download_db_from_hf(specific_file=None, retries=DOWNLOAD_RETRIES, delay=DOWN with open(file_name, 'w', encoding='utf-8') as f: json.dump({'resumes': [], 'vacancies': [], 'freelance_offers': [], 'users': {}}, f) logging.info(f"Created empty local file {file_name} because it was not found on HF.") - success = True # Created locally, treat as success for this file except Exception as create_e: logging.error(f"Failed to create empty local file {file_name}: {create_e}") - break + success = True # Consider it successful if it's a 404 and we're okay with it (e.g. new repo) + break # Don't retry 404 else: logging.error(f"HTTP error downloading {file_name} (Attempt {attempt + 1}): {e}. Retrying in {delay}s...") except Exception as e: - logging.error(f"Unexpected error downloading {file_name} (Attempt {attempt + 1}): {e}. Retrying in {delay}s...", exc_info=True) + logging.error(f"Unexpected error downloading {file_name} (Attempt {attempt + 1}): {e}. Retrying in {delay}s...", exc_info=True) if attempt < retries: time.sleep(delay) if not success: logging.error(f"Failed to download {file_name} after {retries + 1} attempts.") all_successful = False - - if not all_successful: - logging.error("Not all primary files downloaded successfully. Image download might be skipped or incomplete.") - return False - - # Download images if DATA_FILE was part of the download or no specific file was requested (full sync) - if (specific_file is None or specific_file == DATA_FILE) and os.path.exists(DATA_FILE): - logging.info("Downloading referenced image files...") - try: - with open(DATA_FILE, 'r', encoding='utf-8') as f: - data_content = json.load(f) - image_paths_in_data = _get_all_image_paths_from_data(data_content) - - for img_repo_path in image_paths_in_data: - local_img_full_path = os.path.join(app.root_path, img_repo_path) - os.makedirs(os.path.dirname(local_img_full_path), exist_ok=True) - img_success = False - for attempt in range(retries + 1): - try: - logging.info(f"Downloading image {img_repo_path} (Attempt {attempt+1})") - hf_hub_download( - repo_id=REPO_ID, filename=img_repo_path, repo_type="dataset", - token=token_to_use, local_dir=".", local_dir_use_symlinks=False, # local_dir="." means files go into UPLOADS_DIR_NAME/ - force_download=True, resume_download=False - ) - logging.info(f"Successfully downloaded image {img_repo_path}.") - img_success = True - break - except (HfHubHTTPError, EntryNotFoundError) as e_img: - is_404_img = isinstance(e_img, EntryNotFoundError) or (isinstance(e_img, HfHubHTTPError) and e_img.response.status_code == 404) - if is_404_img: - logging.warning(f"Image {img_repo_path} not found on HF (404). Skipping.") - break - else: - logging.error(f"HTTP error downloading image {img_repo_path}: {e_img}. Retrying...") - except Exception as e_img: - logging.error(f"Unexpected error downloading image {img_repo_path}: {e_img}. Retrying...", exc_info=True) - if attempt < retries: time.sleep(delay) - if not img_success: - logging.error(f"Failed to download image {img_repo_path} after multiple attempts.") - all_successful = False # Mark overall sync as potentially incomplete - except Exception as e: - logging.error(f"Error processing or downloading images: {e}", exc_info=True) - all_successful = False - logging.info(f"Download process finished. Overall success: {all_successful}") return all_successful +def upload_file_to_hf_with_retry(local_path, path_in_repo, repo_id, token, commit_message, retries=3, delay=5): + api = HfApi() + for attempt in range(retries + 1): + try: + api.upload_file( + path_or_fileobj=local_path, + path_in_repo=path_in_repo, + repo_id=repo_id, + repo_type="dataset", + token=token, + commit_message=commit_message + ) + logging.info(f"File {local_path} successfully uploaded to {path_in_repo} in {repo_id}.") + return True + except Exception as e: + logging.error(f"Error uploading {local_path} to HF (Attempt {attempt + 1}): {e}") + if attempt < retries: + time.sleep(delay) + else: + logging.error(f"Failed to upload {local_path} to HF after {retries + 1} attempts.") + return False -def upload_db_to_hf(specific_file_local_path=None, specific_file_repo_path=None): +def upload_db_to_hf(specific_file=None): if not HF_TOKEN_WRITE: - logging.warning("HF_TOKEN_WRITE not set. Skipping upload to Hugging Face.") + logging.warning("HF_TOKEN_WRITE not set. Skipping upload of DB to Hugging Face.") return - try: - api = HfApi() - - files_to_upload_map = {} # {local_path: repo_path} - - if specific_file_local_path and specific_file_repo_path: - if os.path.exists(specific_file_local_path): - files_to_upload_map[specific_file_local_path] = specific_file_repo_path - else: - logging.warning(f"Specific file {specific_file_local_path} not found locally for upload.") - else: # Full sync - for file_name in SYNC_FILES: # DATA_FILE - if os.path.exists(file_name): - files_to_upload_map[file_name] = file_name - - # Add images referenced in DATA_FILE for full sync - if os.path.exists(DATA_FILE): - try: - with open(DATA_FILE, 'r', encoding='utf-8') as f: - data_content = json.load(f) - image_paths_in_data = _get_all_image_paths_from_data(data_content) - for img_repo_path in image_paths_in_data: # img_repo_path is like 'uploads/file.jpg' - local_img_path = os.path.join(app.root_path, img_repo_path) - if os.path.exists(local_img_path): - files_to_upload_map[local_img_path] = img_repo_path - else: - logging.warning(f"Image {local_img_path} (referenced in data) not found locally, skipping upload.") - except Exception as e: - logging.error(f"Error reading DATA_FILE for image paths during upload: {e}") - - if not files_to_upload_map: - logging.info("No files to upload.") - return - - logging.info(f"Starting upload of {len(files_to_upload_map)} file(s) to HF repo {REPO_ID}...") - for local_path, repo_path_in_repo in files_to_upload_map.items(): - try: - logging.info(f"Uploading {local_path} to {repo_path_in_repo}...") - api.upload_file( - path_or_fileobj=local_path, path_in_repo=repo_path_in_repo, repo_id=REPO_ID, - repo_type="dataset", token=HF_TOKEN_WRITE, - commit_message=f"Sync {os.path.basename(local_path)} {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + files_to_upload = [specific_file] if specific_file else SYNC_FILES + logging.info(f"Starting upload of {files_to_upload} to HF repo {REPO_ID}...") + for file_name in files_to_upload: + if os.path.exists(file_name): + upload_file_to_hf_with_retry( + local_path=file_name, + path_in_repo=file_name, + repo_id=REPO_ID, + token=HF_TOKEN_WRITE, + commit_message=f"Sync {file_name} {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" ) - logging.info(f"File {local_path} successfully uploaded to {repo_path_in_repo}.") - except Exception as e: - logging.error(f"Error uploading file {local_path} to Hugging Face: {e}") - logging.info("Finished uploading files to HF.") - + else: + logging.warning(f"File {file_name} not found locally, skipping upload.") + logging.info("Finished uploading DB files to HF.") except Exception as e: - logging.error(f"General error during Hugging Face upload initialization or process: {e}", exc_info=True) + logging.error(f"General error during Hugging Face DB upload initialization or process: {e}", exc_info=True) -def delete_files_from_hf(file_paths_in_repo): +def delete_image_from_hf(filename_in_repo_uploads_dir): if not HF_TOKEN_WRITE: - logging.warning("HF_TOKEN_WRITE not set. Skipping deletion from Hugging Face.") + logging.warning("HF_TOKEN_WRITE not set. Skipping image deletion from Hugging Face.") return False - if not file_paths_in_repo: - return True - - api = HfApi() try: - logging.info(f"Attempting to delete files from HF: {file_paths_in_repo}") - # HfApi().delete_files expects list of strings, not string - paths_to_delete = file_paths_in_repo if isinstance(file_paths_in_repo, list) else [file_paths_in_repo] - api.delete_files( + path_in_repo = f"uploads/{filename_in_repo_uploads_dir}" + logging.info(f"Attempting to delete {path_in_repo} from HF repo {REPO_ID}") + hf_delete_file( + path_in_repo=path_in_repo, repo_id=REPO_ID, - paths_in_repo=paths_to_delete, repo_type="dataset", token=HF_TOKEN_WRITE, - commit_message=f"Deleted files: {', '.join(paths_to_delete)}" + commit_message=f"Delete image {filename_in_repo_uploads_dir}" ) - logging.info(f"Successfully deleted files from HF: {paths_to_delete}") + logging.info(f"Successfully deleted {path_in_repo} from Hugging Face.") return True except EntryNotFoundError: - logging.warning(f"Some files not found on HF during deletion: {paths_to_delete}. Considered as success.") - return True # If not found, it's effectively deleted or was never there. + logging.warning(f"Image {path_in_repo} not found on Hugging Face. Could be already deleted.") + return True # Consider it success if not found except Exception as e: - logging.error(f"Error deleting files from Hugging Face: {e}", exc_info=True) + logging.error(f"Error deleting image {path_in_repo} from Hugging Face: {e}", exc_info=True) return False -def delete_local_files(local_file_paths): - for local_path in local_file_paths: - try: - if os.path.exists(local_path): - os.remove(local_path) - logging.info(f"Locally deleted {local_path}") - except Exception as e: - logging.error(f"Error deleting local file {local_path}: {e}") - - def periodic_backup(): backup_interval = 1800 logging.info(f"Setting up periodic backup every {backup_interval} seconds.") while True: time.sleep(backup_interval) logging.info("Starting periodic backup...") - upload_db_to_hf() + upload_db_to_hf() logging.info("Periodic backup finished.") def load_data(): @@ -260,14 +181,19 @@ def load_data(): logging.info(f"Local data loaded successfully from {DATA_FILE}") if not isinstance(data, dict): logging.warning(f"Local {DATA_FILE} is not a dictionary. Attempting download.") - raise FileNotFoundError + raise FileNotFoundError for key in default_data: if key not in data: data[key] = default_data[key] + for item_type in ['resumes', 'vacancies', 'freelance_offers']: # Ensure image_filenames field + if item_type in data: + for item in data[item_type]: + if 'image_filenames' not in item: + item['image_filenames'] = [] return data except (FileNotFoundError, json.JSONDecodeError) as e: logging.warning(f"Error loading local data ({e}). Attempting download from HF.") - if download_db_from_hf(specific_file=DATA_FILE): # This will also attempt to download images referenced in DATA_FILE + if download_db_from_hf(specific_file=DATA_FILE): try: with open(DATA_FILE, 'r', encoding='utf-8') as file: data = json.load(file) @@ -277,6 +203,11 @@ def load_data(): return default_data for key in default_data: if key not in data: data[key] = default_data[key] + for item_type in ['resumes', 'vacancies', 'freelance_offers']: + if item_type in data: + for item in data[item_type]: + if 'image_filenames' not in item: + item['image_filenames'] = [] return data except Exception as load_e: logging.error(f"Error loading downloaded {DATA_FILE}: {load_e}. Using default.", exc_info=True) @@ -299,18 +230,16 @@ def save_data(data): default_keys = {'resumes': [], 'vacancies': [], 'freelance_offers': [], 'users': {}} for key in default_keys: if key not in data: data[key] = default_keys[key] + for item_type in ['resumes', 'vacancies', 'freelance_offers']: # Ensure image_filenames field + if item_type in data: + for item in data[item_type]: + if 'image_filenames' not in item: + item['image_filenames'] = [] with open(DATA_FILE, 'w', encoding='utf-8') as file: json.dump(data, file, ensure_ascii=False, indent=4) logging.info(f"Data successfully saved to {DATA_FILE}") - upload_db_to_hf(specific_file_local_path=DATA_FILE, specific_file_repo_path=DATA_FILE) # Upload DATA_FILE first - # Then upload all images referenced in it. - # The general upload_db_to_hf() without args will handle both DATA_FILE and its referenced images. - # For fine-grained control, could call: upload_db_to_hf() here without args - # to ensure images are also synced after DATA_FILE is updated. - # Current `upload_db_to_hf` will try to upload all referenced images if no specific_file args are given. - # So for just saving data, it's enough to upload DATA_FILE. - # The periodic backup or admin force upload will do a more thorough sync. + upload_db_to_hf(specific_file=DATA_FILE) except Exception as e: logging.error(f"Error saving data to {DATA_FILE}: {e}", exc_info=True) @@ -318,12 +247,7 @@ def verify_telegram_auth_data(auth_data_str, bot_token): if not auth_data_str: return False, None - params = {} - try: - params = dict(urllib.parse.parse_qsl(auth_data_str)) - except Exception: # Broad exception for parsing issues - return False, None - + params = dict(urllib.parse.parse_qsl(auth_data_str)) if 'hash' not in params: return False, None @@ -358,44 +282,129 @@ MAIN_APP_TEMPLATE = '''