Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,12 +12,14 @@ from huggingface_hub import HfApi, hf_hub_download, HfFileSystem
|
|
| 12 |
|
| 13 |
# --- Configuration ---
|
| 14 |
UPLOAD_DIR = "uploaded_files"
|
| 15 |
-
|
|
|
|
| 16 |
HF_DATASET_REPO = "samfred2/A_Text"
|
| 17 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 18 |
|
| 19 |
# Hardcoded variables for automation
|
| 20 |
-
|
|
|
|
| 21 |
STATE_UPLOAD_THRESHOLD = 100
|
| 22 |
|
| 23 |
# Global state object (will be loaded from/saved to STATE_FILE_PATH)
|
|
@@ -36,14 +38,14 @@ def get_hf_api() -> HfApi:
|
|
| 36 |
raise ValueError("HF_TOKEN not found in environment variables.")
|
| 37 |
return HfApi(token=HF_TOKEN)
|
| 38 |
|
| 39 |
-
def
|
| 40 |
-
"""Downloads the state file from the Hugging Face dataset."""
|
| 41 |
try:
|
| 42 |
api = get_hf_api()
|
| 43 |
# Use hf_hub_download for simple file download
|
| 44 |
downloaded_path = hf_hub_download(
|
| 45 |
repo_id=HF_DATASET_REPO,
|
| 46 |
-
filename=
|
| 47 |
repo_type="dataset",
|
| 48 |
local_dir=os.path.dirname(STATE_FILE_PATH),
|
| 49 |
local_dir_use_symlinks=False
|
|
@@ -70,7 +72,8 @@ def upload_file_to_huggingface(local_path: str, path_in_repo: str):
|
|
| 70 |
path_in_repo=path_in_repo,
|
| 71 |
repo_id=HF_DATASET_REPO,
|
| 72 |
repo_type="dataset",
|
| 73 |
-
token=HF_TOKEN
|
|
|
|
| 74 |
)
|
| 75 |
print(f"Successfully uploaded {os.path.basename(local_path)} to {HF_DATASET_REPO} as {path_in_repo}")
|
| 76 |
return True
|
|
@@ -89,26 +92,41 @@ def load_state():
|
|
| 89 |
if os.path.exists(STATE_FILE_PATH):
|
| 90 |
try:
|
| 91 |
with open(STATE_FILE_PATH, "r") as f:
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
except Exception as e:
|
| 95 |
print(f"Error loading state file: {e}. Using default state.")
|
| 96 |
else:
|
| 97 |
print("Local state file not found. Using default state.")
|
| 98 |
|
| 99 |
def save_state():
|
| 100 |
-
"""Saves application state to the local state file."""
|
|
|
|
|
|
|
| 101 |
try:
|
| 102 |
with open(STATE_FILE_PATH, "w") as f:
|
| 103 |
json.dump(app_state, f, indent=4)
|
| 104 |
-
print(f"State saved successfully to {STATE_FILE_PATH}.")
|
| 105 |
except Exception as e:
|
| 106 |
print(f"Error saving state file: {e}")
|
| 107 |
|
| 108 |
def upload_state_file_to_hf():
|
| 109 |
-
"""Saves the current
|
| 110 |
save_state()
|
| 111 |
-
upload_file_to_huggingface(STATE_FILE_PATH,
|
| 112 |
global app_state
|
| 113 |
app_state["files_since_last_state_upload"] = 0
|
| 114 |
|
|
@@ -126,12 +144,13 @@ def zip_uploaded_files_versioned() -> str:
|
|
| 126 |
zip_filename = f"uploaded_files_{app_state['current_zip_version']}.zip"
|
| 127 |
zip_path = os.path.join(os.getcwd(), zip_filename)
|
| 128 |
|
| 129 |
-
exclude_file =
|
| 130 |
|
| 131 |
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
| 132 |
for root, _, files in os.walk(UPLOAD_DIR):
|
| 133 |
for file in files:
|
| 134 |
file_path = os.path.join(root, file)
|
|
|
|
| 135 |
if file != exclude_file:
|
| 136 |
# Add file to zip, preserving directory structure relative to UPLOAD_DIR
|
| 137 |
zipf.write(file_path, os.path.relpath(file_path, UPLOAD_DIR))
|
|
@@ -158,7 +177,8 @@ def cleanup_upload_dir():
|
|
| 158 |
item_path = os.path.join(UPLOAD_DIR, item)
|
| 159 |
if os.path.isdir(item_path):
|
| 160 |
shutil.rmtree(item_path)
|
| 161 |
-
|
|
|
|
| 162 |
os.remove(item_path)
|
| 163 |
print(f"Cleaned up files in {UPLOAD_DIR} directory (state file preserved).")
|
| 164 |
|
|
@@ -170,15 +190,16 @@ async def lifespan(app: FastAPI):
|
|
| 170 |
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 171 |
print(f"Application starting. Upload directory: {UPLOAD_DIR}")
|
| 172 |
|
| 173 |
-
# Startup: 2. Download and load state
|
| 174 |
-
|
|
|
|
| 175 |
load_state()
|
| 176 |
-
print(f"Initial state: {app_state}")
|
| 177 |
|
| 178 |
yield
|
| 179 |
|
| 180 |
-
# Shutdown: 1. Upload state file
|
| 181 |
-
print("Application shutting down. Uploading final state...")
|
| 182 |
upload_state_file_to_hf()
|
| 183 |
|
| 184 |
# Shutdown: 2. Perform final zip and upload of any remaining files
|
|
@@ -212,27 +233,42 @@ async def upload_file(file: UploadFile = File(...)):
|
|
| 212 |
# 1. Save the file
|
| 213 |
try:
|
| 214 |
file_path = os.path.join(UPLOAD_DIR, file.filename)
|
|
|
|
| 215 |
# Check if file already exists to prevent overwriting without warning
|
| 216 |
if os.path.exists(file_path):
|
| 217 |
-
|
|
|
|
|
|
|
| 218 |
|
| 219 |
try:
|
|
|
|
| 220 |
with open(file_path, "wb") as buffer:
|
| 221 |
shutil.copyfileobj(file.file, buffer)
|
| 222 |
except Exception as e:
|
| 223 |
raise HTTPException(status_code=500, detail=f"Could not write file to disk: {e}")
|
| 224 |
|
| 225 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
app_state["total_files_uploaded"] += 1
|
| 227 |
app_state["files_since_last_zip"] += 1
|
| 228 |
app_state["files_since_last_state_upload"] += 1
|
| 229 |
-
save_state() # Save
|
| 230 |
-
|
|
|
|
| 231 |
if app_state["files_since_last_state_upload"] >= STATE_UPLOAD_THRESHOLD:
|
| 232 |
print(f"State upload threshold ({STATE_UPLOAD_THRESHOLD}) reached. Uploading state file...")
|
|
|
|
| 233 |
upload_state_file_to_hf()
|
| 234 |
|
| 235 |
-
# 4. Check for zip upload threshold (
|
| 236 |
if app_state["files_since_last_zip"] >= ZIP_UPLOAD_THRESHOLD:
|
| 237 |
print(f"Zip upload threshold ({ZIP_UPLOAD_THRESHOLD}) reached. Zipping and uploading files...")
|
| 238 |
zip_path = zip_uploaded_files_versioned()
|
|
@@ -265,7 +301,7 @@ async def sync_dataset():
|
|
| 265 |
"""Manually trigger zipping of all uploaded files and uploading to the Hugging Face dataset."""
|
| 266 |
print("Manual dataset sync triggered.")
|
| 267 |
|
| 268 |
-
# 1. Upload state file
|
| 269 |
upload_state_file_to_hf()
|
| 270 |
|
| 271 |
# 2. Zip and upload files
|
|
@@ -287,13 +323,44 @@ async def list_files():
|
|
| 287 |
return {"files": []}
|
| 288 |
|
| 289 |
# Exclude the state file from the list of downloadable files
|
| 290 |
-
files = [f for f in os.listdir(UPLOAD_DIR) if f !=
|
| 291 |
return {"files": files, "state": app_state}
|
| 292 |
|
| 293 |
@app.get("/state/")
|
| 294 |
async def get_state():
|
| 295 |
-
"""
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
# --- Main execution block for testing/running ---
|
| 299 |
if __name__ == "__main__":
|
|
@@ -319,4 +386,4 @@ if __name__ == "__main__":
|
|
| 319 |
finally:
|
| 320 |
# Simulate cleanup that happens in the lifespan context manager
|
| 321 |
# when running with uvicorn in a real environment.
|
| 322 |
-
pass
|
|
|
|
| 12 |
|
| 13 |
# --- Configuration ---
|
| 14 |
UPLOAD_DIR = "uploaded_files"
|
| 15 |
+
STATE_FILE_NAME = "processing_audio_state.json"
|
| 16 |
+
STATE_FILE_PATH = os.path.join(os.getcwd(), STATE_FILE_NAME)
|
| 17 |
HF_DATASET_REPO = "samfred2/A_Text"
|
| 18 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 19 |
|
| 20 |
# Hardcoded variables for automation
|
| 21 |
+
# Both thresholds are now 100 as requested
|
| 22 |
+
ZIP_UPLOAD_THRESHOLD = 100
|
| 23 |
STATE_UPLOAD_THRESHOLD = 100
|
| 24 |
|
| 25 |
# Global state object (will be loaded from/saved to STATE_FILE_PATH)
|
|
|
|
| 38 |
raise ValueError("HF_TOKEN not found in environment variables.")
|
| 39 |
return HfApi(token=HF_TOKEN)
|
| 40 |
|
| 41 |
+
def download_state_file_from_hf():
|
| 42 |
+
"""Downloads the state file from the Hugging Face dataset to the local path."""
|
| 43 |
try:
|
| 44 |
api = get_hf_api()
|
| 45 |
# Use hf_hub_download for simple file download
|
| 46 |
downloaded_path = hf_hub_download(
|
| 47 |
repo_id=HF_DATASET_REPO,
|
| 48 |
+
filename=STATE_FILE_NAME,
|
| 49 |
repo_type="dataset",
|
| 50 |
local_dir=os.path.dirname(STATE_FILE_PATH),
|
| 51 |
local_dir_use_symlinks=False
|
|
|
|
| 72 |
path_in_repo=path_in_repo,
|
| 73 |
repo_id=HF_DATASET_REPO,
|
| 74 |
repo_type="dataset",
|
| 75 |
+
token=HF_TOKEN,
|
| 76 |
+
commit_message=f"Sync: {path_in_repo}"
|
| 77 |
)
|
| 78 |
print(f"Successfully uploaded {os.path.basename(local_path)} to {HF_DATASET_REPO} as {path_in_repo}")
|
| 79 |
return True
|
|
|
|
| 92 |
if os.path.exists(STATE_FILE_PATH):
|
| 93 |
try:
|
| 94 |
with open(STATE_FILE_PATH, "r") as f:
|
| 95 |
+
# Load the state file content, which contains the file_states and next_download_index
|
| 96 |
+
state_content = json.load(f)
|
| 97 |
+
|
| 98 |
+
# The app_state itself only tracks counters, not the file_states
|
| 99 |
+
# We only update the counters here if they exist in the state file
|
| 100 |
+
if "total_files_uploaded" in state_content:
|
| 101 |
+
app_state["total_files_uploaded"] = state_content["total_files_uploaded"]
|
| 102 |
+
if "current_zip_version" in state_content:
|
| 103 |
+
app_state["current_zip_version"] = state_content["current_zip_version"]
|
| 104 |
+
if "files_since_last_zip" in state_content:
|
| 105 |
+
app_state["files_since_last_zip"] = state_content["files_since_last_zip"]
|
| 106 |
+
if "files_since_last_state_upload" in state_content:
|
| 107 |
+
app_state["files_since_last_state_upload"] = state_content["files_since_last_state_upload"]
|
| 108 |
+
|
| 109 |
+
print(f"State counters loaded successfully from {STATE_FILE_PATH}.")
|
| 110 |
except Exception as e:
|
| 111 |
print(f"Error loading state file: {e}. Using default state.")
|
| 112 |
else:
|
| 113 |
print("Local state file not found. Using default state.")
|
| 114 |
|
| 115 |
def save_state():
|
| 116 |
+
"""Saves application state (counters) to the local state file."""
|
| 117 |
+
# This function is now only for saving the internal app_state counters,
|
| 118 |
+
# not the file_states, which are managed by the client.
|
| 119 |
try:
|
| 120 |
with open(STATE_FILE_PATH, "w") as f:
|
| 121 |
json.dump(app_state, f, indent=4)
|
| 122 |
+
print(f"State counters saved successfully to {STATE_FILE_PATH}.")
|
| 123 |
except Exception as e:
|
| 124 |
print(f"Error saving state file: {e}")
|
| 125 |
|
| 126 |
def upload_state_file_to_hf():
|
| 127 |
+
"""Saves the current app_state counters and uploads the state file to Hugging Face."""
|
| 128 |
save_state()
|
| 129 |
+
upload_file_to_huggingface(STATE_FILE_PATH, STATE_FILE_NAME)
|
| 130 |
global app_state
|
| 131 |
app_state["files_since_last_state_upload"] = 0
|
| 132 |
|
|
|
|
| 144 |
zip_filename = f"uploaded_files_{app_state['current_zip_version']}.zip"
|
| 145 |
zip_path = os.path.join(os.getcwd(), zip_filename)
|
| 146 |
|
| 147 |
+
exclude_file = STATE_FILE_NAME
|
| 148 |
|
| 149 |
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
| 150 |
for root, _, files in os.walk(UPLOAD_DIR):
|
| 151 |
for file in files:
|
| 152 |
file_path = os.path.join(root, file)
|
| 153 |
+
# Only zip files that are NOT the state file
|
| 154 |
if file != exclude_file:
|
| 155 |
# Add file to zip, preserving directory structure relative to UPLOAD_DIR
|
| 156 |
zipf.write(file_path, os.path.relpath(file_path, UPLOAD_DIR))
|
|
|
|
| 177 |
item_path = os.path.join(UPLOAD_DIR, item)
|
| 178 |
if os.path.isdir(item_path):
|
| 179 |
shutil.rmtree(item_path)
|
| 180 |
+
# Only remove files that are NOT the state file
|
| 181 |
+
elif item != STATE_FILE_NAME:
|
| 182 |
os.remove(item_path)
|
| 183 |
print(f"Cleaned up files in {UPLOAD_DIR} directory (state file preserved).")
|
| 184 |
|
|
|
|
| 190 |
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 191 |
print(f"Application starting. Upload directory: {UPLOAD_DIR}")
|
| 192 |
|
| 193 |
+
# Startup: 2. Download and load state (only internal counters are loaded here)
|
| 194 |
+
# We rely on the /state/ endpoint to download the file_states from HF when needed.
|
| 195 |
+
download_state_file_from_hf()
|
| 196 |
load_state()
|
| 197 |
+
print(f"Initial state counters: {app_state}")
|
| 198 |
|
| 199 |
yield
|
| 200 |
|
| 201 |
+
# Shutdown: 1. Upload state file (only internal counters are uploaded here)
|
| 202 |
+
print("Application shutting down. Uploading final state counters...")
|
| 203 |
upload_state_file_to_hf()
|
| 204 |
|
| 205 |
# Shutdown: 2. Perform final zip and upload of any remaining files
|
|
|
|
| 233 |
# 1. Save the file
|
| 234 |
try:
|
| 235 |
file_path = os.path.join(UPLOAD_DIR, file.filename)
|
| 236 |
+
|
| 237 |
# Check if file already exists to prevent overwriting without warning
|
| 238 |
if os.path.exists(file_path):
|
| 239 |
+
# Special handling for the state file: allow overwrite as it's a sync mechanism
|
| 240 |
+
if file.filename != STATE_FILE_NAME:
|
| 241 |
+
raise HTTPException(status_code=409, detail=f"File '{file.filename}' already exists.")
|
| 242 |
|
| 243 |
try:
|
| 244 |
+
# Write the uploaded file to the local UPLOAD_DIR
|
| 245 |
with open(file_path, "wb") as buffer:
|
| 246 |
shutil.copyfileobj(file.file, buffer)
|
| 247 |
except Exception as e:
|
| 248 |
raise HTTPException(status_code=500, detail=f"Could not write file to disk: {e}")
|
| 249 |
|
| 250 |
+
# --- LOGIC FOR STATE FILE HANDLING ---
|
| 251 |
+
if file.filename == STATE_FILE_NAME:
|
| 252 |
+
print(f"Intercepted state file upload: {STATE_FILE_NAME}. Saving locally only.")
|
| 253 |
+
# The client's lock/unlock is now persisted locally on the server.
|
| 254 |
+
# It will only be uploaded to HF when the STATE_UPLOAD_THRESHOLD is met.
|
| 255 |
+
# We return success immediately.
|
| 256 |
+
return {"filename": file.filename, "message": "State file successfully saved locally", "state": app_state}
|
| 257 |
+
# --- END LOGIC FOR STATE FILE HANDLING ---
|
| 258 |
+
|
| 259 |
+
# 2. Update counters only after successful write of a non-state file (transcription JSON)
|
| 260 |
app_state["total_files_uploaded"] += 1
|
| 261 |
app_state["files_since_last_zip"] += 1
|
| 262 |
app_state["files_since_last_state_upload"] += 1
|
| 263 |
+
save_state() # Save internal counters locally
|
| 264 |
+
|
| 265 |
+
# 3. Check for state file upload threshold (100 non-state file uploads)
|
| 266 |
if app_state["files_since_last_state_upload"] >= STATE_UPLOAD_THRESHOLD:
|
| 267 |
print(f"State upload threshold ({STATE_UPLOAD_THRESHOLD}) reached. Uploading state file...")
|
| 268 |
+
# This uploads the local state file (which includes the latest file_states) to HF
|
| 269 |
upload_state_file_to_hf()
|
| 270 |
|
| 271 |
+
# 4. Check for zip upload threshold (100 non-state file uploads)
|
| 272 |
if app_state["files_since_last_zip"] >= ZIP_UPLOAD_THRESHOLD:
|
| 273 |
print(f"Zip upload threshold ({ZIP_UPLOAD_THRESHOLD}) reached. Zipping and uploading files...")
|
| 274 |
zip_path = zip_uploaded_files_versioned()
|
|
|
|
| 301 |
"""Manually trigger zipping of all uploaded files and uploading to the Hugging Face dataset."""
|
| 302 |
print("Manual dataset sync triggered.")
|
| 303 |
|
| 304 |
+
# 1. Upload state file (internal counters)
|
| 305 |
upload_state_file_to_hf()
|
| 306 |
|
| 307 |
# 2. Zip and upload files
|
|
|
|
| 323 |
return {"files": []}
|
| 324 |
|
| 325 |
# Exclude the state file from the list of downloadable files
|
| 326 |
+
files = [f for f in os.listdir(UPLOAD_DIR) if f != STATE_FILE_NAME]
|
| 327 |
return {"files": files, "state": app_state}
|
| 328 |
|
| 329 |
@app.get("/state/")
|
| 330 |
async def get_state():
|
| 331 |
+
"""
|
| 332 |
+
Get the current application state (including file processing status).
|
| 333 |
+
This endpoint now ensures it fetches the latest state from Hugging Face.
|
| 334 |
+
"""
|
| 335 |
+
# 1. Load the file_states from the local file, which is updated by client uploads.
|
| 336 |
+
# We assume the local file exists because it's either downloaded on startup
|
| 337 |
+
# or created by a client upload.
|
| 338 |
+
if not os.path.exists(STATE_FILE_PATH):
|
| 339 |
+
# If the file is missing, attempt a download from HF as a fallback
|
| 340 |
+
download_state_file_from_hf()
|
| 341 |
+
if not os.path.exists(STATE_FILE_PATH):
|
| 342 |
+
# If still not found (e.g., first run and 404 on HF), return default
|
| 343 |
+
return {"state": {"next_download_index": 0, "file_states": {}}}
|
| 344 |
+
|
| 345 |
+
try:
|
| 346 |
+
with open(STATE_FILE_PATH, "r") as f:
|
| 347 |
+
state_content = json.load(f)
|
| 348 |
+
|
| 349 |
+
# Ensure the required keys for the client are present
|
| 350 |
+
if "file_states" not in state_content:
|
| 351 |
+
state_content["file_states"] = {}
|
| 352 |
+
if "next_download_index" not in state_content:
|
| 353 |
+
state_content["next_download_index"] = 0
|
| 354 |
+
|
| 355 |
+
# Merge the internal app_state counters (which are not critical for locking)
|
| 356 |
+
# into the returned state for completeness, but the core file_states come from the file.
|
| 357 |
+
state_content.update(app_state)
|
| 358 |
+
|
| 359 |
+
return {"state": state_content}
|
| 360 |
+
|
| 361 |
+
except Exception as e:
|
| 362 |
+
print(f"Error reading local state file for /state/ endpoint: {e}")
|
| 363 |
+
raise HTTPException(status_code=500, detail="Error processing state file.")
|
| 364 |
|
| 365 |
# --- Main execution block for testing/running ---
|
| 366 |
if __name__ == "__main__":
|
|
|
|
| 386 |
finally:
|
| 387 |
# Simulate cleanup that happens in the lifespan context manager
|
| 388 |
# when running with uvicorn in a real environment.
|
| 389 |
+
pass
|