factorstudios committed on
Commit
0eb334a
·
verified ·
1 Parent(s): 482deae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -10
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import re
3
  import time
@@ -116,7 +117,7 @@ def list_rar_files_in_dataset() -> list:
116
  rar_files = [
117
  f for f in all_files
118
  if f.startswith(DATA_PATH)
119
- and (f.lower().endswith('.rar') or re.search(r'\.r\d{2}$', f, re.IGNORECASE))
120
  ]
121
 
122
  logging.info(f"[*] Found {len(rar_files)} RAR files")
@@ -128,6 +129,32 @@ def list_rar_files_in_dataset() -> list:
128
  logging.error(f"[!] Failed to list files: {e}")
129
  return []
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # === Extract and Upload RAR ===
132
  async def extract_and_upload_rar(rar_file_path: str):
133
  try:
@@ -173,13 +200,22 @@ async def rar_processor_worker():
173
  try:
174
  logging.info("[*] Scanning for RAR files...")
175
  rar_files = list_rar_files_in_dataset()
 
 
 
 
 
 
 
176
 
177
- if rar_files:
178
- for rar_file in rar_files:
 
 
179
  await extract_and_upload_rar(rar_file)
180
  await asyncio.sleep(5) # Delay between files
181
  else:
182
- logging.info("[*] No RAR files found, waiting...")
183
 
184
  # Wait 60 seconds before next scan
185
  await asyncio.sleep(60)
@@ -210,20 +246,35 @@ def health():
210
 
211
  @app.get("/scan")
212
  def scan_rars():
213
- """Manually trigger RAR file scan"""
214
  rar_files = list_rar_files_in_dataset()
215
- return {"found": len(rar_files), "files": rar_files}
 
 
 
 
 
 
 
 
216
 
217
  @app.post("/extract-all")
218
  async def extract_all():
219
- """Manually trigger extraction of all RAR files"""
220
  rar_files = list_rar_files_in_dataset()
 
221
 
222
- if not rar_files:
223
- return {"message": "No RAR files found"}
 
 
 
 
 
 
224
 
225
  results = []
226
- for rar_file in rar_files:
227
  success = await extract_and_upload_rar(rar_file)
228
  results.append({"file": rar_file, "success": success})
229
  await asyncio.sleep(5)
 
1
+
2
  import os
3
  import re
4
  import time
 
117
  rar_files = [
118
  f for f in all_files
119
  if f.startswith(DATA_PATH)
120
+ and (f.lower().endswith(".rar") or re.search(r'\.r\d{2}$', f, re.IGNORECASE))
121
  ]
122
 
123
  logging.info(f"[*] Found {len(rar_files)} RAR files")
 
129
  logging.error(f"[!] Failed to list files: {e}")
130
  return []
131
 
132
+ # === List Extracted Courses in Dataset ===
133
def list_extracted_courses_in_dataset() -> set:
    """Return the names of courses that already exist under EXTRACTED_PATH.

    Lists every file of the dataset repository REPO_ID via
    ``list_repo_files`` and collects, for each file located below
    ``EXTRACTED_PATH + '/'``, the path component that immediately follows
    EXTRACTED_PATH.

    Returns:
        A set of those path components. If anything raises while listing
        or processing, the error is logged and an empty set is returned.
    """
    try:
        logging.info(f"[*] Scanning dataset for extracted courses in {EXTRACTED_PATH}")
        repo_listing = list_repo_files(
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN
        )

        # Index of the first path component after EXTRACTED_PATH.
        depth = len(EXTRACTED_PATH.split('/'))
        prefix = EXTRACTED_PATH + '/'

        # NOTE(review): a file stored directly under EXTRACTED_PATH (not in a
        # sub-folder) contributes its own file name here — confirm that is
        # the intended meaning of "course".
        course_names = {
            path.split('/')[depth]
            for path in repo_listing
            if path.startswith(prefix) and len(path.split('/')) > depth
        }

        logging.info(f"[*] Found {len(course_names)} extracted courses")
        for name in course_names:
            logging.info(f" - {name}")

        return course_names
    except Exception as e:
        # Best-effort: callers receive an empty set instead of an exception.
        logging.error(f"[!] Failed to list extracted courses: {e}")
        return set()
157
+
158
  # === Extract and Upload RAR ===
159
  async def extract_and_upload_rar(rar_file_path: str):
160
  try:
 
200
  try:
201
  logging.info("[*] Scanning for RAR files...")
202
  rar_files = list_rar_files_in_dataset()
203
+ extracted_courses = list_extracted_courses_in_dataset()
204
+
205
+ untouched_rar_files = []
206
+ for rar_file in rar_files:
207
+ course_name = extract_course_name(os.path.basename(rar_file))
208
+ if course_name not in extracted_courses:
209
+ untouched_rar_files.append(rar_file)
210
 
211
+ if untouched_rar_files:
212
+ logging.info(f"[*] Found {len(untouched_rar_files)} untouched RAR files.")
213
+ for rar_file in untouched_rar_files:
214
+ logging.info(f" - {rar_file}")
215
  await extract_and_upload_rar(rar_file)
216
  await asyncio.sleep(5) # Delay between files
217
  else:
218
+ logging.info("[*] No untouched RAR files found, waiting...")
219
 
220
  # Wait 60 seconds before next scan
221
  await asyncio.sleep(60)
 
246
 
247
  @app.get("/scan")
248
  def scan_rars():
249
+ """Manually trigger RAR file scan and identify untouched ones"""
250
  rar_files = list_rar_files_in_dataset()
251
+ extracted_courses = list_extracted_courses_in_dataset()
252
+
253
+ untouched_rar_files = []
254
+ for rar_file in rar_files:
255
+ course_name = extract_course_name(os.path.basename(rar_file))
256
+ if course_name not in extracted_courses:
257
+ untouched_rar_files.append(rar_file)
258
+
259
+ return {"total_rar_files": len(rar_files), "extracted_courses": len(extracted_courses), "untouched_rar_files": untouched_rar_files}
260
 
261
  @app.post("/extract-all")
262
  async def extract_all():
263
+ """Manually trigger extraction of all untouched RAR files"""
264
  rar_files = list_rar_files_in_dataset()
265
+ extracted_courses = list_extracted_courses_in_dataset()
266
 
267
+ untouched_rar_files = []
268
+ for rar_file in rar_files:
269
+ course_name = extract_course_name(os.path.basename(rar_file))
270
+ if course_name not in extracted_courses:
271
+ untouched_rar_files.append(rar_file)
272
+
273
+ if not untouched_rar_files:
274
+ return {"message": "No untouched RAR files found to extract"}
275
 
276
  results = []
277
+ for rar_file in untouched_rar_files:
278
  success = await extract_and_upload_rar(rar_file)
279
  results.append({"file": rar_file, "success": success})
280
  await asyncio.sleep(5)