TilanB committed on
Commit
ca6762d
·
verified ·
1 Parent(s): 9d7baa4

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +73 -34
main.py CHANGED
@@ -942,11 +942,16 @@ setInterval(tick, 500);
942
  first_row = ds[0]
943
  pdf_data = first_row.get('pdf', None)
944
  logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
945
- if isinstance(pdf_data, dict):
 
 
 
 
 
946
  logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
947
  if 'path' in pdf_data:
948
- logger.info(f"PDF path example: {pdf_data.get('path', 'N/A')}")
949
-
950
  # Extract requested files from dataset rows
951
  for file_path in file_names:
952
  filename = os.path.basename(file_path)
@@ -963,14 +968,27 @@ setInterval(tick, 500);
963
  continue
964
 
965
  # Extract the actual filename from the pdf data
966
- # HF datasets library returns file objects as dicts with 'path' key
967
- if isinstance(pdf_data, dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
968
  row_filename = pdf_data.get('path', '')
 
969
  elif isinstance(pdf_data, str):
970
  row_filename = pdf_data
971
- else:
972
- # Try to get path attribute (for other formats)
973
- row_filename = getattr(pdf_data, 'path', '') or str(pdf_data)
974
 
975
  row_basename = os.path.basename(str(row_filename))
976
  logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
@@ -981,51 +999,72 @@ setInterval(tick, 500);
981
  logger.info(f"Found match! Extracting {filename}...")
982
 
983
  try:
984
- # Handle different data formats from HF datasets
985
- if isinstance(pdf_data, dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
986
  if 'bytes' in pdf_data and pdf_data['bytes']:
987
- # Most common: dict with 'bytes' key
988
  with open(temp_file_path, 'wb') as f:
989
  f.write(pdf_data['bytes'])
990
- logger.info(f"Wrote {len(pdf_data['bytes'])} bytes to {temp_file_path}")
 
991
  elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
992
- # File path exists on disk (HF caches files)
993
  shutil.copy2(pdf_data['path'], temp_file_path)
994
- logger.info(f"Copied from cache: {pdf_data['path']}")
995
- else:
996
- logger.error(f"Dict has no usable data: {list(pdf_data.keys())}")
997
- continue
998
  elif isinstance(pdf_data, bytes):
999
- # Direct bytes
1000
  with open(temp_file_path, 'wb') as f:
1001
  f.write(pdf_data)
 
 
 
1002
  elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
1003
- # File path string
1004
  shutil.copy2(pdf_data, temp_file_path)
1005
- else:
1006
- logger.error(f"Unknown PDF data type: {type(pdf_data)}")
1007
- continue
1008
 
1009
- copied_files.append(temp_file_path)
1010
- file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
1011
- file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
1012
- file_found = True
1013
- logger.info(f"✅ Successfully extracted {filename}")
1014
- break
 
 
 
 
1015
  except Exception as ex:
1016
  logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
1017
  continue
1018
 
1019
  if not file_found:
1020
- # Debug: show what's actually in the dataset
1021
  logger.warning(f"❌ File {filename} not found in dataset rows")
 
1022
  for idx, row in enumerate(ds):
1023
  pdf_data = row.get('pdf', None)
1024
- if pdf_data:
1025
- if isinstance(pdf_data, dict):
1026
- available_name = os.path.basename(str(pdf_data.get('path', 'unknown')))
1027
- else:
1028
- available_name = str(type(pdf_data))
1029
  logger.info(f" Available file in row {idx}: '{available_name}'")
1030
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
1031
 
 
942
  first_row = ds[0]
943
  pdf_data = first_row.get('pdf', None)
944
  logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
945
+
946
+ # Handle different types
947
+ if hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
948
+ # pdfplumber PDF object
949
+ logger.info(f"PDF is pdfplumber object, stream path: {pdf_data.stream.name}")
950
+ elif isinstance(pdf_data, dict):
951
  logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
952
  if 'path' in pdf_data:
953
+ logger.info(f"PDF path: {pdf_data.get('path', 'N/A')}")
954
+
955
  # Extract requested files from dataset rows
956
  for file_path in file_names:
957
  filename = os.path.basename(file_path)
 
968
  continue
969
 
970
  # Extract the actual filename from the pdf data
971
+ # HF datasets with PDF files can return different types:
972
+ # 1. pdfplumber.pdf.PDF objects (when using pdf feature type)
973
+ # 2. dict with 'path' and 'bytes' keys
974
+ # 3. str path
975
+ # 4. bytes directly
976
+
977
+ row_filename = ""
978
+
979
+ # Check for pdfplumber PDF object (has .stream.name attribute)
980
+ if hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
981
+ row_filename = pdf_data.stream.name
982
+ logger.debug(f"Got filename from pdfplumber stream: {row_filename}")
983
+ # Check for pdfplumber PDF object with path attribute
984
+ elif hasattr(pdf_data, 'path'):
985
+ row_filename = pdf_data.path
986
+ # Check for dict format
987
+ elif isinstance(pdf_data, dict):
988
  row_filename = pdf_data.get('path', '')
989
+ # Check for string path
990
  elif isinstance(pdf_data, str):
991
  row_filename = pdf_data
 
 
 
992
 
993
  row_basename = os.path.basename(str(row_filename))
994
  logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
 
999
  logger.info(f"Found match! Extracting {filename}...")
1000
 
1001
  try:
1002
+ extracted = False
1003
+
1004
+ # Handle pdfplumber PDF object
1005
+ if hasattr(pdf_data, 'stream'):
1006
+ # Get the file path from pdfplumber's stream
1007
+ source_path = pdf_data.stream.name
1008
+ if source_path and os.path.exists(source_path):
1009
+ shutil.copy2(source_path, temp_file_path)
1010
+ logger.info(f"Copied from pdfplumber stream: {source_path}")
1011
+ extracted = True
1012
+ else:
1013
+ # Try to read bytes from stream
1014
+ try:
1015
+ pdf_data.stream.seek(0)
1016
+ pdf_bytes = pdf_data.stream.read()
1017
+ with open(temp_file_path, 'wb') as f:
1018
+ f.write(pdf_bytes)
1019
+ logger.info(f"Wrote {len(pdf_bytes)} bytes from pdfplumber stream")
1020
+ extracted = True
1021
+ except Exception as stream_err:
1022
+ logger.warning(f"Could not read stream: {stream_err}")
1023
+
1024
+ # Handle dict format
1025
+ elif isinstance(pdf_data, dict):
1026
  if 'bytes' in pdf_data and pdf_data['bytes']:
 
1027
  with open(temp_file_path, 'wb') as f:
1028
  f.write(pdf_data['bytes'])
1029
+ logger.info(f"Wrote {len(pdf_data['bytes'])} bytes")
1030
+ extracted = True
1031
  elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
 
1032
  shutil.copy2(pdf_data['path'], temp_file_path)
1033
+ logger.info(f"Copied from dict path: {pdf_data['path']}")
1034
+ extracted = True
1035
+
1036
+ # Handle bytes directly
1037
  elif isinstance(pdf_data, bytes):
 
1038
  with open(temp_file_path, 'wb') as f:
1039
  f.write(pdf_data)
1040
+ extracted = True
1041
+
1042
+ # Handle string path
1043
  elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
 
1044
  shutil.copy2(pdf_data, temp_file_path)
1045
+ extracted = True
 
 
1046
 
1047
+ if extracted and os.path.exists(temp_file_path):
1048
+ copied_files.append(temp_file_path)
1049
+ file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
1050
+ file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
1051
+ file_found = True
1052
+ logger.info(f"✅ Successfully extracted {filename}")
1053
+ break
1054
+ else:
1055
+ logger.error(f"Could not extract file: {type(pdf_data)}")
1056
+
1057
  except Exception as ex:
1058
  logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
1059
  continue
1060
 
1061
  if not file_found:
 
1062
  logger.warning(f"❌ File {filename} not found in dataset rows")
1063
+ # Debug: show what's available
1064
  for idx, row in enumerate(ds):
1065
  pdf_data = row.get('pdf', None)
1066
+ if pdf_data and hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
1067
+ available_name = os.path.basename(str(pdf_data.stream.name))
 
 
 
1068
  logger.info(f" Available file in row {idx}: '{available_name}'")
1069
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
1070