TilanB committed on
Commit
94d209d
·
verified ·
1 Parent(s): c6f51a5

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +42 -27
main.py CHANGED
@@ -945,41 +945,56 @@ setInterval(tick, 500);
945
  # Search through dataset rows
946
  for row in ds:
947
  # Check if this row contains our file
948
- # Adjust field names based on your dataset structure
949
- row_filename = row.get('filename') or row.get('name') or row.get('path', '')
950
 
951
- if os.path.basename(str(row_filename)) == filename:
 
952
  temp_file_path = os.path.join(temp_dir, filename)
953
 
954
- # Handle different dataset column formats
955
- if 'content' in row and row['content']:
956
- with open(temp_file_path, 'wb') as f:
957
- f.write(row['content'])
958
- elif 'file' in row and row['file']:
959
- file_obj = row['file']
960
- if isinstance(file_obj, dict) and 'bytes' in file_obj:
961
- with open(temp_file_path, 'wb') as f:
962
- f.write(file_obj['bytes'])
963
- elif isinstance(file_obj, bytes):
964
- with open(temp_file_path, 'wb') as f:
965
- f.write(file_obj)
966
- elif 'data' in row and row['data']:
967
- with open(temp_file_path, 'wb') as f:
968
- f.write(row['data'])
969
- else:
970
- logger.warning(f"Unknown dataset format for {filename}, fields: {list(row.keys())}")
971
- continue
972
 
973
- copied_files.append(temp_file_path)
974
- file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
975
- file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
976
- file_found = True
977
- logger.info(f"Successfully extracted {filename} from dataset")
978
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979
 
980
  if not file_found:
981
  logger.warning(f"File {filename} not found in dataset rows")
982
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
 
 
 
983
 
984
  if not copied_files:
985
  if len(ds) > 0:
 
945
  # Search through dataset rows
946
  for row in ds:
947
  # Check if this row contains our file
948
+ # The dataset has a 'pdf' column with file paths
949
+ row_filename = row.get('pdf', '')
950
 
951
+ # Match by filename (the PDF column stores filenames)
952
+ if isinstance(row_filename, str) and os.path.basename(row_filename) == filename:
953
  temp_file_path = os.path.join(temp_dir, filename)
954
 
955
+ # The 'pdf' column contains the actual file path/data
956
+ # Datasets library auto-loads files from the 'pdf' column
957
+ pdf_data = row.get('pdf')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
958
 
959
+ if pdf_data:
960
+ try:
961
+ # Check if it's already bytes
962
+ if isinstance(pdf_data, bytes):
963
+ with open(temp_file_path, 'wb') as f:
964
+ f.write(pdf_data)
965
+ # Check if it's a dict with 'bytes' key (common format)
966
+ elif isinstance(pdf_data, dict):
967
+ if 'bytes' in pdf_data:
968
+ with open(temp_file_path, 'wb') as f:
969
+ f.write(pdf_data['bytes'])
970
+ elif 'path' in pdf_data:
971
+ # It's a file path, copy the file
972
+ import shutil
973
+ shutil.copy2(pdf_data['path'], temp_file_path)
974
+ # Try to read as file path string
975
+ elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
976
+ import shutil
977
+ shutil.copy2(pdf_data, temp_file_path)
978
+ else:
979
+ logger.error(f"Unknown PDF data format: {type(pdf_data)}")
980
+ continue
981
+
982
+ copied_files.append(temp_file_path)
983
+ file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
984
+ file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
985
+ file_found = True
986
+ logger.info(f"Successfully extracted {filename} from dataset")
987
+ break
988
+ except Exception as ex:
989
+ logger.error(f"Failed to extract {filename}: {ex}")
990
+ continue
991
 
992
  if not file_found:
993
  logger.warning(f"File {filename} not found in dataset rows")
994
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
995
+ # Debug: log the 'pdf' values available in the dataset
996
+ available = [row.get('pdf', 'N/A') for row in ds]
997
+ logger.debug(f"Available files in dataset: {available}")
998
 
999
  if not copied_files:
1000
  if len(ds) > 0: