Samfredoly committed on
Commit
3059f2d
·
verified ·
1 Parent(s): fef919a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -29
app.py CHANGED
@@ -12,12 +12,14 @@ from huggingface_hub import HfApi, hf_hub_download, HfFileSystem
12
 
13
  # --- Configuration ---
14
  UPLOAD_DIR = "uploaded_files"
15
- STATE_FILE_PATH = os.path.join(os.getcwd(), "processing_audio_state.json")
 
16
  HF_DATASET_REPO = "samfred2/A_Text"
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
 
19
  # Hardcoded variables for automation
20
- ZIP_UPLOAD_THRESHOLD = 200
 
21
  STATE_UPLOAD_THRESHOLD = 100
22
 
23
  # Global state object (will be loaded from/saved to STATE_FILE_PATH)
@@ -36,14 +38,14 @@ def get_hf_api() -> HfApi:
36
  raise ValueError("HF_TOKEN not found in environment variables.")
37
  return HfApi(token=HF_TOKEN)
38
 
39
- def download_state_file():
40
- """Downloads the state file from the Hugging Face dataset."""
41
  try:
42
  api = get_hf_api()
43
  # Use hf_hub_download for simple file download
44
  downloaded_path = hf_hub_download(
45
  repo_id=HF_DATASET_REPO,
46
- filename=os.path.basename(STATE_FILE_PATH),
47
  repo_type="dataset",
48
  local_dir=os.path.dirname(STATE_FILE_PATH),
49
  local_dir_use_symlinks=False
@@ -70,7 +72,8 @@ def upload_file_to_huggingface(local_path: str, path_in_repo: str):
70
  path_in_repo=path_in_repo,
71
  repo_id=HF_DATASET_REPO,
72
  repo_type="dataset",
73
- token=HF_TOKEN
 
74
  )
75
  print(f"Successfully uploaded {os.path.basename(local_path)} to {HF_DATASET_REPO} as {path_in_repo}")
76
  return True
@@ -89,26 +92,41 @@ def load_state():
89
  if os.path.exists(STATE_FILE_PATH):
90
  try:
91
  with open(STATE_FILE_PATH, "r") as f:
92
- app_state.update(json.load(f))
93
- print(f"State loaded successfully from {STATE_FILE_PATH}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  except Exception as e:
95
  print(f"Error loading state file: {e}. Using default state.")
96
  else:
97
  print("Local state file not found. Using default state.")
98
 
99
  def save_state():
100
- """Saves application state to the local state file."""
 
 
101
  try:
102
  with open(STATE_FILE_PATH, "w") as f:
103
  json.dump(app_state, f, indent=4)
104
- print(f"State saved successfully to {STATE_FILE_PATH}.")
105
  except Exception as e:
106
  print(f"Error saving state file: {e}")
107
 
108
  def upload_state_file_to_hf():
109
- """Saves the current state and uploads the state file to Hugging Face."""
110
  save_state()
111
- upload_file_to_huggingface(STATE_FILE_PATH, os.path.basename(STATE_FILE_PATH))
112
  global app_state
113
  app_state["files_since_last_state_upload"] = 0
114
 
@@ -126,12 +144,13 @@ def zip_uploaded_files_versioned() -> str:
126
  zip_filename = f"uploaded_files_{app_state['current_zip_version']}.zip"
127
  zip_path = os.path.join(os.getcwd(), zip_filename)
128
 
129
- exclude_file = os.path.basename(STATE_FILE_PATH)
130
 
131
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
132
  for root, _, files in os.walk(UPLOAD_DIR):
133
  for file in files:
134
  file_path = os.path.join(root, file)
 
135
  if file != exclude_file:
136
  # Add file to zip, preserving directory structure relative to UPLOAD_DIR
137
  zipf.write(file_path, os.path.relpath(file_path, UPLOAD_DIR))
@@ -158,7 +177,8 @@ def cleanup_upload_dir():
158
  item_path = os.path.join(UPLOAD_DIR, item)
159
  if os.path.isdir(item_path):
160
  shutil.rmtree(item_path)
161
- elif item != os.path.basename(STATE_FILE_PATH):
 
162
  os.remove(item_path)
163
  print(f"Cleaned up files in {UPLOAD_DIR} directory (state file preserved).")
164
 
@@ -170,15 +190,16 @@ async def lifespan(app: FastAPI):
170
  os.makedirs(UPLOAD_DIR, exist_ok=True)
171
  print(f"Application starting. Upload directory: {UPLOAD_DIR}")
172
 
173
- # Startup: 2. Download and load state
174
- download_state_file()
 
175
  load_state()
176
- print(f"Initial state: {app_state}")
177
 
178
  yield
179
 
180
- # Shutdown: 1. Upload state file
181
- print("Application shutting down. Uploading final state...")
182
  upload_state_file_to_hf()
183
 
184
  # Shutdown: 2. Perform final zip and upload of any remaining files
@@ -212,27 +233,42 @@ async def upload_file(file: UploadFile = File(...)):
212
  # 1. Save the file
213
  try:
214
  file_path = os.path.join(UPLOAD_DIR, file.filename)
 
215
  # Check if file already exists to prevent overwriting without warning
216
  if os.path.exists(file_path):
217
- raise HTTPException(status_code=409, detail=f"File '{file.filename}' already exists.")
 
 
218
 
219
  try:
 
220
  with open(file_path, "wb") as buffer:
221
  shutil.copyfileobj(file.file, buffer)
222
  except Exception as e:
223
  raise HTTPException(status_code=500, detail=f"Could not write file to disk: {e}")
224
 
225
- # 2. Update counters only after successful write
 
 
 
 
 
 
 
 
 
226
  app_state["total_files_uploaded"] += 1
227
  app_state["files_since_last_zip"] += 1
228
  app_state["files_since_last_state_upload"] += 1
229
- save_state() # Save state after every upload
230
- # 3. Check for state file upload threshold (100 uploads)
 
231
  if app_state["files_since_last_state_upload"] >= STATE_UPLOAD_THRESHOLD:
232
  print(f"State upload threshold ({STATE_UPLOAD_THRESHOLD}) reached. Uploading state file...")
 
233
  upload_state_file_to_hf()
234
 
235
- # 4. Check for zip upload threshold (200 uploads)
236
  if app_state["files_since_last_zip"] >= ZIP_UPLOAD_THRESHOLD:
237
  print(f"Zip upload threshold ({ZIP_UPLOAD_THRESHOLD}) reached. Zipping and uploading files...")
238
  zip_path = zip_uploaded_files_versioned()
@@ -265,7 +301,7 @@ async def sync_dataset():
265
  """Manually trigger zipping of all uploaded files and uploading to the Hugging Face dataset."""
266
  print("Manual dataset sync triggered.")
267
 
268
- # 1. Upload state file
269
  upload_state_file_to_hf()
270
 
271
  # 2. Zip and upload files
@@ -287,13 +323,44 @@ async def list_files():
287
  return {"files": []}
288
 
289
  # Exclude the state file from the list of downloadable files
290
- files = [f for f in os.listdir(UPLOAD_DIR) if f != os.path.basename(STATE_FILE_PATH)]
291
  return {"files": files, "state": app_state}
292
 
293
  @app.get("/state/")
294
  async def get_state():
295
- """Get the current application state."""
296
- return {"state": app_state}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
  # --- Main execution block for testing/running ---
299
  if __name__ == "__main__":
@@ -319,4 +386,4 @@ if __name__ == "__main__":
319
  finally:
320
  # Simulate cleanup that happens in the lifespan context manager
321
  # when running with uvicorn in a real environment.
322
- pass
 
12
 
13
  # --- Configuration ---
14
  UPLOAD_DIR = "uploaded_files"
15
+ STATE_FILE_NAME = "processing_audio_state.json"
16
+ STATE_FILE_PATH = os.path.join(os.getcwd(), STATE_FILE_NAME)
17
  HF_DATASET_REPO = "samfred2/A_Text"
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
 
20
  # Hardcoded variables for automation
21
+ # Both thresholds are now 100 as requested
22
+ ZIP_UPLOAD_THRESHOLD = 100
23
  STATE_UPLOAD_THRESHOLD = 100
24
 
25
  # Global state object (will be loaded from/saved to STATE_FILE_PATH)
 
38
  raise ValueError("HF_TOKEN not found in environment variables.")
39
  return HfApi(token=HF_TOKEN)
40
 
41
+ def download_state_file_from_hf():
42
+ """Downloads the state file from the Hugging Face dataset to the local path."""
43
  try:
44
  api = get_hf_api()
45
  # Use hf_hub_download for simple file download
46
  downloaded_path = hf_hub_download(
47
  repo_id=HF_DATASET_REPO,
48
+ filename=STATE_FILE_NAME,
49
  repo_type="dataset",
50
  local_dir=os.path.dirname(STATE_FILE_PATH),
51
  local_dir_use_symlinks=False
 
72
  path_in_repo=path_in_repo,
73
  repo_id=HF_DATASET_REPO,
74
  repo_type="dataset",
75
+ token=HF_TOKEN,
76
+ commit_message=f"Sync: {path_in_repo}"
77
  )
78
  print(f"Successfully uploaded {os.path.basename(local_path)} to {HF_DATASET_REPO} as {path_in_repo}")
79
  return True
 
92
  if os.path.exists(STATE_FILE_PATH):
93
  try:
94
  with open(STATE_FILE_PATH, "r") as f:
95
+ # Load the state file content, which contains the file_states and next_download_index
96
+ state_content = json.load(f)
97
+
98
+ # The app_state itself only tracks counters, not the file_states
99
+ # We only update the counters here if they exist in the state file
100
+ if "total_files_uploaded" in state_content:
101
+ app_state["total_files_uploaded"] = state_content["total_files_uploaded"]
102
+ if "current_zip_version" in state_content:
103
+ app_state["current_zip_version"] = state_content["current_zip_version"]
104
+ if "files_since_last_zip" in state_content:
105
+ app_state["files_since_last_zip"] = state_content["files_since_last_zip"]
106
+ if "files_since_last_state_upload" in state_content:
107
+ app_state["files_since_last_state_upload"] = state_content["files_since_last_state_upload"]
108
+
109
+ print(f"State counters loaded successfully from {STATE_FILE_PATH}.")
110
  except Exception as e:
111
  print(f"Error loading state file: {e}. Using default state.")
112
  else:
113
  print("Local state file not found. Using default state.")
114
 
115
  def save_state():
116
+ """Saves application state (counters) to the local state file."""
117
+ # This function is now only for saving the internal app_state counters,
118
+ # not the file_states, which are managed by the client.
119
  try:
120
  with open(STATE_FILE_PATH, "w") as f:
121
  json.dump(app_state, f, indent=4)
122
+ print(f"State counters saved successfully to {STATE_FILE_PATH}.")
123
  except Exception as e:
124
  print(f"Error saving state file: {e}")
125
 
126
  def upload_state_file_to_hf():
127
+ """Saves the current app_state counters and uploads the state file to Hugging Face."""
128
  save_state()
129
+ upload_file_to_huggingface(STATE_FILE_PATH, STATE_FILE_NAME)
130
  global app_state
131
  app_state["files_since_last_state_upload"] = 0
132
 
 
144
  zip_filename = f"uploaded_files_{app_state['current_zip_version']}.zip"
145
  zip_path = os.path.join(os.getcwd(), zip_filename)
146
 
147
+ exclude_file = STATE_FILE_NAME
148
 
149
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
150
  for root, _, files in os.walk(UPLOAD_DIR):
151
  for file in files:
152
  file_path = os.path.join(root, file)
153
+ # Only zip files that are NOT the state file
154
  if file != exclude_file:
155
  # Add file to zip, preserving directory structure relative to UPLOAD_DIR
156
  zipf.write(file_path, os.path.relpath(file_path, UPLOAD_DIR))
 
177
  item_path = os.path.join(UPLOAD_DIR, item)
178
  if os.path.isdir(item_path):
179
  shutil.rmtree(item_path)
180
+ # Only remove files that are NOT the state file
181
+ elif item != STATE_FILE_NAME:
182
  os.remove(item_path)
183
  print(f"Cleaned up files in {UPLOAD_DIR} directory (state file preserved).")
184
 
 
190
  os.makedirs(UPLOAD_DIR, exist_ok=True)
191
  print(f"Application starting. Upload directory: {UPLOAD_DIR}")
192
 
193
+ # Startup: 2. Download and load state (only internal counters are loaded here)
194
+ # We rely on the /state/ endpoint to download the file_states from HF when needed.
195
+ download_state_file_from_hf()
196
  load_state()
197
+ print(f"Initial state counters: {app_state}")
198
 
199
  yield
200
 
201
+ # Shutdown: 1. Upload state file (only internal counters are uploaded here)
202
+ print("Application shutting down. Uploading final state counters...")
203
  upload_state_file_to_hf()
204
 
205
  # Shutdown: 2. Perform final zip and upload of any remaining files
 
233
  # 1. Save the file
234
  try:
235
  file_path = os.path.join(UPLOAD_DIR, file.filename)
236
+
237
  # Check if file already exists to prevent overwriting without warning
238
  if os.path.exists(file_path):
239
+ # Special handling for the state file: allow overwrite as it's a sync mechanism
240
+ if file.filename != STATE_FILE_NAME:
241
+ raise HTTPException(status_code=409, detail=f"File '{file.filename}' already exists.")
242
 
243
  try:
244
+ # Write the uploaded file to the local UPLOAD_DIR
245
  with open(file_path, "wb") as buffer:
246
  shutil.copyfileobj(file.file, buffer)
247
  except Exception as e:
248
  raise HTTPException(status_code=500, detail=f"Could not write file to disk: {e}")
249
 
250
+ # --- LOGIC FOR STATE FILE HANDLING ---
251
+ if file.filename == STATE_FILE_NAME:
252
+ print(f"Intercepted state file upload: {STATE_FILE_NAME}. Saving locally only.")
253
+ # The client's lock/unlock is now persisted locally on the server.
254
+ # It will only be uploaded to HF when the STATE_UPLOAD_THRESHOLD is met.
255
+ # We return success immediately.
256
+ return {"filename": file.filename, "message": "State file successfully saved locally", "state": app_state}
257
+ # --- END LOGIC FOR STATE FILE HANDLING ---
258
+
259
+ # 2. Update counters only after successful write of a non-state file (transcription JSON)
260
  app_state["total_files_uploaded"] += 1
261
  app_state["files_since_last_zip"] += 1
262
  app_state["files_since_last_state_upload"] += 1
263
+ save_state() # Save internal counters locally
264
+
265
+ # 3. Check for state file upload threshold (100 non-state file uploads)
266
  if app_state["files_since_last_state_upload"] >= STATE_UPLOAD_THRESHOLD:
267
  print(f"State upload threshold ({STATE_UPLOAD_THRESHOLD}) reached. Uploading state file...")
268
+ # This uploads the local state file (which includes the latest file_states) to HF
269
  upload_state_file_to_hf()
270
 
271
+ # 4. Check for zip upload threshold (100 non-state file uploads)
272
  if app_state["files_since_last_zip"] >= ZIP_UPLOAD_THRESHOLD:
273
  print(f"Zip upload threshold ({ZIP_UPLOAD_THRESHOLD}) reached. Zipping and uploading files...")
274
  zip_path = zip_uploaded_files_versioned()
 
301
  """Manually trigger zipping of all uploaded files and uploading to the Hugging Face dataset."""
302
  print("Manual dataset sync triggered.")
303
 
304
+ # 1. Upload state file (internal counters)
305
  upload_state_file_to_hf()
306
 
307
  # 2. Zip and upload files
 
323
  return {"files": []}
324
 
325
  # Exclude the state file from the list of downloadable files
326
+ files = [f for f in os.listdir(UPLOAD_DIR) if f != STATE_FILE_NAME]
327
  return {"files": files, "state": app_state}
328
 
329
  @app.get("/state/")
330
  async def get_state():
331
+ """
332
+ Get the current application state (including file processing status).
333
+ This endpoint now ensures it fetches the latest state from Hugging Face.
334
+ """
335
+ # 1. Load the file_states from the local file, which is updated by client uploads.
336
+ # We assume the local file exists because it's either downloaded on startup
337
+ # or created by a client upload.
338
+ if not os.path.exists(STATE_FILE_PATH):
339
+ # If the file is missing, attempt a download from HF as a fallback
340
+ download_state_file_from_hf()
341
+ if not os.path.exists(STATE_FILE_PATH):
342
+ # If still not found (e.g., first run and 404 on HF), return default
343
+ return {"state": {"next_download_index": 0, "file_states": {}}}
344
+
345
+ try:
346
+ with open(STATE_FILE_PATH, "r") as f:
347
+ state_content = json.load(f)
348
+
349
+ # Ensure the required keys for the client are present
350
+ if "file_states" not in state_content:
351
+ state_content["file_states"] = {}
352
+ if "next_download_index" not in state_content:
353
+ state_content["next_download_index"] = 0
354
+
355
+ # Merge the internal app_state counters (which are not critical for locking)
356
+ # into the returned state for completeness, but the core file_states come from the file.
357
+ state_content.update(app_state)
358
+
359
+ return {"state": state_content}
360
+
361
+ except Exception as e:
362
+ print(f"Error reading local state file for /state/ endpoint: {e}")
363
+ raise HTTPException(status_code=500, detail="Error processing state file.")
364
 
365
  # --- Main execution block for testing/running ---
366
  if __name__ == "__main__":
 
386
  finally:
387
  # Simulate cleanup that happens in the lifespan context manager
388
  # when running with uvicorn in a real environment.
389
+ pass