TilanB committed on
Commit
c6f51a5
·
verified ·
1 Parent(s): c23a6c5

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +34 -9
main.py CHANGED
@@ -907,9 +907,23 @@ setInterval(tick, 500);
907
  copied_files = []
908
  file_info_text = f"✅ Loaded: {example_key}\n\n"
909
 
910
- # Get HF token (optional for public datasets)
911
  hf_token = os.environ.get("HF_TOKEN", None)
912
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913
  try:
914
  # Load dataset - uses row-based structure
915
  logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
@@ -934,16 +948,14 @@ setInterval(tick, 500);
934
  # Adjust field names based on your dataset structure
935
  row_filename = row.get('filename') or row.get('name') or row.get('path', '')
936
 
937
- if os.path.basename(row_filename) == filename:
938
  temp_file_path = os.path.join(temp_dir, filename)
939
 
940
  # Handle different dataset column formats
941
  if 'content' in row and row['content']:
942
- # Binary content stored directly
943
  with open(temp_file_path, 'wb') as f:
944
  f.write(row['content'])
945
  elif 'file' in row and row['file']:
946
- # File object with bytes
947
  file_obj = row['file']
948
  if isinstance(file_obj, dict) and 'bytes' in file_obj:
949
  with open(temp_file_path, 'wb') as f:
@@ -952,11 +964,10 @@ setInterval(tick, 500);
952
  with open(temp_file_path, 'wb') as f:
953
  f.write(file_obj)
954
  elif 'data' in row and row['data']:
955
- # Raw data field
956
  with open(temp_file_path, 'wb') as f:
957
  f.write(row['data'])
958
  else:
959
- logger.warning(f"Unknown dataset format for {filename}, available fields: {list(row.keys())}")
960
  continue
961
 
962
  copied_files.append(temp_file_path)
@@ -971,16 +982,30 @@ setInterval(tick, 500);
971
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
972
 
973
  if not copied_files:
974
- # Log dataset structure for debugging
975
  if len(ds) > 0:
976
  logger.error(f"Dataset structure: {list(ds[0].keys())}")
977
- return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please check dataset structure or upload files manually."
978
 
979
  return copied_files, question_text, file_info_text
980
 
981
  except Exception as e:
 
982
  logger.error(f"Failed to load dataset: {e}", exc_info=True)
983
- return [], "", f"❌ Failed to load dataset: {str(e)}\n\nPlease upload files manually."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
984
 
985
  except ImportError as e:
986
  logger.error(f"datasets package not installed: {e}")
 
907
  copied_files = []
908
  file_info_text = f"✅ Loaded: {example_key}\n\n"
909
 
910
+ # Get HF token - REQUIRED for gated datasets
911
  hf_token = os.environ.get("HF_TOKEN", None)
912
 
913
+ if not hf_token:
914
+ logger.warning("HF_TOKEN not set - required for gated datasets")
915
+ return [], "", (
916
+ "❌ **Authentication Required**\n\n"
917
+ "The example dataset is gated and requires authentication.\n\n"
918
+ "**To fix:**\n"
919
+ "1. Go to Space Settings → Repository secrets\n"
920
+ "2. Add secret: `HF_TOKEN` = your Hugging Face token\n"
921
+ "3. Restart the Space\n\n"
922
+ "Or make your dataset public at:\n"
923
+ "https://huggingface.co/datasets/TilanB/smartdoc-samples/settings\n\n"
924
+ "For now, please **upload files manually**."
925
+ )
926
+
927
  try:
928
  # Load dataset - uses row-based structure
929
  logger.info(f"Loading dataset from HuggingFace: TilanB/smartdoc-samples")
 
948
  # Adjust field names based on your dataset structure
949
  row_filename = row.get('filename') or row.get('name') or row.get('path', '')
950
 
951
+ if os.path.basename(str(row_filename)) == filename:
952
  temp_file_path = os.path.join(temp_dir, filename)
953
 
954
  # Handle different dataset column formats
955
  if 'content' in row and row['content']:
 
956
  with open(temp_file_path, 'wb') as f:
957
  f.write(row['content'])
958
  elif 'file' in row and row['file']:
 
959
  file_obj = row['file']
960
  if isinstance(file_obj, dict) and 'bytes' in file_obj:
961
  with open(temp_file_path, 'wb') as f:
 
964
  with open(temp_file_path, 'wb') as f:
965
  f.write(file_obj)
966
  elif 'data' in row and row['data']:
 
967
  with open(temp_file_path, 'wb') as f:
968
  f.write(row['data'])
969
  else:
970
+ logger.warning(f"Unknown dataset format for {filename}, fields: {list(row.keys())}")
971
  continue
972
 
973
  copied_files.append(temp_file_path)
 
982
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
983
 
984
  if not copied_files:
 
985
  if len(ds) > 0:
986
  logger.error(f"Dataset structure: {list(ds[0].keys())}")
987
+ return [], "", f"❌ Could not find example files in dataset.\n\nDataset has {len(ds)} rows. Please upload files manually."
988
 
989
  return copied_files, question_text, file_info_text
990
 
991
  except Exception as e:
992
+ error_msg = str(e)
993
  logger.error(f"Failed to load dataset: {e}", exc_info=True)
994
+
995
+ # Check for gated dataset error
996
+ if "gated" in error_msg.lower() or "authenticated" in error_msg.lower():
997
+ return [], "", (
998
+ "❌ **Dataset Access Denied**\n\n"
999
+ "The dataset is gated and your token doesn't have access.\n\n"
1000
+ "**To fix:**\n"
1001
+ "1. Visit: https://huggingface.co/datasets/TilanB/smartdoc-samples\n"
1002
+ "2. Accept the access terms (if any)\n"
1003
+ "3. Make sure HF_TOKEN is set in Space secrets\n\n"
1004
+ "Or make your dataset public.\n\n"
1005
+ "For now, please **upload files manually**."
1006
+ )
1007
+
1008
+ return [], "", f"❌ Failed to load dataset: {error_msg}\n\nPlease upload files manually."
1009
 
1010
  except ImportError as e:
1011
  logger.error(f"datasets package not installed: {e}")