Zeggai Abdellah committed on
Commit
4a8836b
·
1 Parent(s): 8517947

Update the question generation to work with all files

Browse files
Files changed (30) hide show
  1. app.py +28 -9
  2. chunk/chunks_WER8624_241_unstructured_api_output.json +0 -0
  3. chunk/chunks_WER8647_521_unstructured_api_output.json +0 -0
  4. chunk/chunks_WER8827_269_unstructured_api_output.json +0 -0
  5. chunk/chunks_WER8839_413_unstructured_api_output.json +0 -0
  6. chunk/chunks_WER8925_unstructured_api_output.json +0 -0
  7. chunk/chunks_WER9008_57_unstructured_api_output.json +0 -0
  8. chunk/chunks_WER9009_69_unstructured_api_output.json +0 -0
  9. chunk/chunks_WER9018_185_unstructured_api_output.json +0 -0
  10. chunk/chunks_WER9035_unstructured_api_output.json +0 -0
  11. chunk/chunks_WER9039_505_unstructured_api_output.json +0 -0
  12. chunk/chunks_WER9206_unstructured_api_output.json +0 -0
  13. chunk/chunks_WER9217_unstructured_api_output.json +0 -0
  14. chunk/chunks_WER9225_unstructured_api_output.json +0 -0
  15. chunk/chunks_WER9227_unstructured_api_output.json +0 -0
  16. chunk/chunks_WER9231_unstructured_api_output.json +0 -0
  17. chunk/chunks_WER9234_unstructured_api_output.json +0 -0
  18. chunk/chunks_WER9308_unstructured_api_output.json +0 -0
  19. chunk/chunks_WER9313_unstructured_api_output.json +0 -0
  20. chunk/chunks_WER9316_unstructured_api_output.json +0 -0
  21. chunk/chunks_WER9527_unstructured_api_output.json +0 -0
  22. chunk/chunks_WER9623_unstructured_api_output.json +0 -0
  23. chunk/chunks_WER9719_unstructured_api_output.json +0 -0
  24. chunk/chunks_WER9720_unstructured_api_output.json +0 -0
  25. chunk/chunks_WER9725_unstructured_api_output.json +0 -0
  26. chunk/chunks_WER9740_unstructured_api_output.json +0 -0
  27. chunk/chunks_WER9750_unstructured_api_output.json +0 -0
  28. chunk/chunks_WER9901_02_unstructured_api_output.json +0 -0
  29. chunk/chunks_With_bboxes_elements_Immunization_unstructured_api_output_mr_.json +0 -0
  30. chunks.json → chunk/chunks_filtered_elements_with_bboxes.json +0 -0
app.py CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime
9
  from huggingface_hub import HfApi # For file persistence in Spaces
10
  import os
11
  import threading
 
12
 
13
  # Load environment variables from .env file
14
  load_dotenv()
@@ -195,7 +196,7 @@ def save_dataset_to_space(dataset: Dict, filename: str):
195
  @app.get("/generate-questions")
196
  async def generate_questions():
197
  """
198
- Endpoint to generate questions from the vaccine guide chunks
199
  """
200
  global generation_status
201
 
@@ -219,23 +220,41 @@ async def generate_questions():
219
  generation_status["result_file"] = None
220
  generation_status["error"] = None
221
 
222
- # Load chunks
223
- with open("./chunks.json", "r", encoding="utf-8") as f:
224
- chunks_data = json.load(f)
225
 
226
- if chunks_data is None:
227
- raise HTTPException(status_code=404, detail="Chunks file not found in any known location")
228
 
229
- VACCINE_CHUNKS = [chunk["text"] for chunk in chunks_data]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
  # Start generation in background thread
232
- thread = threading.Thread(target=generate_questions_in_background, args=(VACCINE_CHUNKS,))
233
  thread.daemon = True
234
  thread.start()
235
 
236
  return {
237
  "status": "started",
238
- "message": "Question generation started in background",
239
  "current_status": generation_status
240
  }
241
  except Exception as e:
 
9
  from huggingface_hub import HfApi # For file persistence in Spaces
10
  import os
11
  import threading
12
+ import glob
13
 
14
  # Load environment variables from .env file
15
  load_dotenv()
 
196
  @app.get("/generate-questions")
197
  async def generate_questions():
198
  """
199
+ Endpoint to generate questions from all JSON files in the data folder
200
  """
201
  global generation_status
202
 
 
220
  generation_status["result_file"] = None
221
  generation_status["error"] = None
222
 
223
+ # Load all JSON files from data folder
224
+ json_files = glob.glob("./data/*.json")
 
225
 
226
+ if not json_files:
227
+ raise HTTPException(status_code=404, detail="No JSON files found in data folder")
228
 
229
+ all_chunks = []
230
+ for json_file in json_files:
231
+ with open(json_file, "r", encoding="utf-8") as f:
232
+ chunks_data = json.load(f)
233
+ if isinstance(chunks_data, list):
234
+ # If it's a list of chunks
235
+ for chunk in chunks_data:
236
+ if isinstance(chunk, dict) and "text" in chunk:
237
+ all_chunks.append(chunk["text"])
238
+ elif isinstance(chunk, str):
239
+ all_chunks.append(chunk)
240
+ elif isinstance(chunks_data, dict):
241
+ # If it's a dict, try to extract text content
242
+ if "text" in chunks_data:
243
+ all_chunks.append(chunks_data["text"])
244
+ elif "content" in chunks_data:
245
+ all_chunks.append(chunks_data["content"])
246
+
247
+ if not all_chunks:
248
+ raise HTTPException(status_code=404, detail="No text content found in JSON files")
249
 
250
  # Start generation in background thread
251
+ thread = threading.Thread(target=generate_questions_in_background, args=(all_chunks,))
252
  thread.daemon = True
253
  thread.start()
254
 
255
  return {
256
  "status": "started",
257
+ "message": f"Question generation started for {len(json_files)} JSON files with {len(all_chunks)} chunks",
258
  "current_status": generation_status
259
  }
260
  except Exception as e:
chunk/chunks_WER8624_241_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER8647_521_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER8827_269_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER8839_413_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER8925_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9008_57_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9009_69_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9018_185_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9035_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9039_505_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9206_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9217_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9225_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9227_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9231_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9234_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9308_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9313_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9316_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9527_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9623_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9719_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9720_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9725_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9740_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9750_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_WER9901_02_unstructured_api_output.json ADDED
The diff for this file is too large to render. See raw diff
 
chunk/chunks_With_bboxes_elements_Immunization_unstructured_api_output_mr_.json ADDED
The diff for this file is too large to render. See raw diff
 
chunks.json → chunk/chunks_filtered_elements_with_bboxes.json RENAMED
The diff for this file is too large to render. See raw diff