Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -942,11 +942,16 @@ setInterval(tick, 500);
|
|
| 942 |
first_row = ds[0]
|
| 943 |
pdf_data = first_row.get('pdf', None)
|
| 944 |
logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
|
| 945 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
|
| 947 |
if 'path' in pdf_data:
|
| 948 |
-
logger.info(f"PDF path
|
| 949 |
-
|
| 950 |
# Extract requested files from dataset rows
|
| 951 |
for file_path in file_names:
|
| 952 |
filename = os.path.basename(file_path)
|
|
@@ -963,14 +968,27 @@ setInterval(tick, 500);
|
|
| 963 |
continue
|
| 964 |
|
| 965 |
# Extract the actual filename from the pdf data
|
| 966 |
-
# HF datasets
|
| 967 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 968 |
row_filename = pdf_data.get('path', '')
|
|
|
|
| 969 |
elif isinstance(pdf_data, str):
|
| 970 |
row_filename = pdf_data
|
| 971 |
-
else:
|
| 972 |
-
# Try to get path attribute (for other formats)
|
| 973 |
-
row_filename = getattr(pdf_data, 'path', '') or str(pdf_data)
|
| 974 |
|
| 975 |
row_basename = os.path.basename(str(row_filename))
|
| 976 |
logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
|
|
@@ -981,51 +999,72 @@ setInterval(tick, 500);
|
|
| 981 |
logger.info(f"Found match! Extracting {filename}...")
|
| 982 |
|
| 983 |
try:
|
| 984 |
-
|
| 985 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
if 'bytes' in pdf_data and pdf_data['bytes']:
|
| 987 |
-
# Most common: dict with 'bytes' key
|
| 988 |
with open(temp_file_path, 'wb') as f:
|
| 989 |
f.write(pdf_data['bytes'])
|
| 990 |
-
logger.info(f"Wrote {len(pdf_data['bytes'])} bytes
|
|
|
|
| 991 |
elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
|
| 992 |
-
# File path exists on disk (HF caches files)
|
| 993 |
shutil.copy2(pdf_data['path'], temp_file_path)
|
| 994 |
-
logger.info(f"Copied from
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
elif isinstance(pdf_data, bytes):
|
| 999 |
-
# Direct bytes
|
| 1000 |
with open(temp_file_path, 'wb') as f:
|
| 1001 |
f.write(pdf_data)
|
|
|
|
|
|
|
|
|
|
| 1002 |
elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
|
| 1003 |
-
# File path string
|
| 1004 |
shutil.copy2(pdf_data, temp_file_path)
|
| 1005 |
-
|
| 1006 |
-
logger.error(f"Unknown PDF data type: {type(pdf_data)}")
|
| 1007 |
-
continue
|
| 1008 |
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1015 |
except Exception as ex:
|
| 1016 |
logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
|
| 1017 |
continue
|
| 1018 |
|
| 1019 |
if not file_found:
|
| 1020 |
-
# Debug: show what's actually in the dataset
|
| 1021 |
logger.warning(f"❌ File {filename} not found in dataset rows")
|
|
|
|
| 1022 |
for idx, row in enumerate(ds):
|
| 1023 |
pdf_data = row.get('pdf', None)
|
| 1024 |
-
if pdf_data:
|
| 1025 |
-
|
| 1026 |
-
available_name = os.path.basename(str(pdf_data.get('path', 'unknown')))
|
| 1027 |
-
else:
|
| 1028 |
-
available_name = str(type(pdf_data))
|
| 1029 |
logger.info(f" Available file in row {idx}: '{available_name}'")
|
| 1030 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 1031 |
|
|
|
|
| 942 |
first_row = ds[0]
|
| 943 |
pdf_data = first_row.get('pdf', None)
|
| 944 |
logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
|
| 945 |
+
|
| 946 |
+
# Handle different types
|
| 947 |
+
if hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
|
| 948 |
+
# pdfplumber PDF object
|
| 949 |
+
logger.info(f"PDF is pdfplumber object, stream path: {pdf_data.stream.name}")
|
| 950 |
+
elif isinstance(pdf_data, dict):
|
| 951 |
logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
|
| 952 |
if 'path' in pdf_data:
|
| 953 |
+
logger.info(f"PDF path: {pdf_data.get('path', 'N/A')}")
|
| 954 |
+
|
| 955 |
# Extract requested files from dataset rows
|
| 956 |
for file_path in file_names:
|
| 957 |
filename = os.path.basename(file_path)
|
|
|
|
| 968 |
continue
|
| 969 |
|
| 970 |
# Extract the actual filename from the pdf data
|
| 971 |
+
# HF datasets with PDF files can return different types:
|
| 972 |
+
# 1. pdfplumber.pdf.PDF objects (when using pdf feature type)
|
| 973 |
+
# 2. dict with 'path' and 'bytes' keys
|
| 974 |
+
# 3. str path
|
| 975 |
+
# 4. bytes directly
|
| 976 |
+
|
| 977 |
+
row_filename = ""
|
| 978 |
+
|
| 979 |
+
# Check for pdfplumber PDF object (has .stream.name attribute)
|
| 980 |
+
if hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
|
| 981 |
+
row_filename = pdf_data.stream.name
|
| 982 |
+
logger.debug(f"Got filename from pdfplumber stream: {row_filename}")
|
| 983 |
+
# Check for pdfplumber PDF object with path attribute
|
| 984 |
+
elif hasattr(pdf_data, 'path'):
|
| 985 |
+
row_filename = pdf_data.path
|
| 986 |
+
# Check for dict format
|
| 987 |
+
elif isinstance(pdf_data, dict):
|
| 988 |
row_filename = pdf_data.get('path', '')
|
| 989 |
+
# Check for string path
|
| 990 |
elif isinstance(pdf_data, str):
|
| 991 |
row_filename = pdf_data
|
|
|
|
|
|
|
|
|
|
| 992 |
|
| 993 |
row_basename = os.path.basename(str(row_filename))
|
| 994 |
logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
|
|
|
|
| 999 |
logger.info(f"Found match! Extracting {filename}...")
|
| 1000 |
|
| 1001 |
try:
|
| 1002 |
+
extracted = False
|
| 1003 |
+
|
| 1004 |
+
# Handle pdfplumber PDF object
|
| 1005 |
+
if hasattr(pdf_data, 'stream'):
|
| 1006 |
+
# Get the file path from pdfplumber's stream
|
| 1007 |
+
source_path = pdf_data.stream.name
|
| 1008 |
+
if source_path and os.path.exists(source_path):
|
| 1009 |
+
shutil.copy2(source_path, temp_file_path)
|
| 1010 |
+
logger.info(f"Copied from pdfplumber stream: {source_path}")
|
| 1011 |
+
extracted = True
|
| 1012 |
+
else:
|
| 1013 |
+
# Try to read bytes from stream
|
| 1014 |
+
try:
|
| 1015 |
+
pdf_data.stream.seek(0)
|
| 1016 |
+
pdf_bytes = pdf_data.stream.read()
|
| 1017 |
+
with open(temp_file_path, 'wb') as f:
|
| 1018 |
+
f.write(pdf_bytes)
|
| 1019 |
+
logger.info(f"Wrote {len(pdf_bytes)} bytes from pdfplumber stream")
|
| 1020 |
+
extracted = True
|
| 1021 |
+
except Exception as stream_err:
|
| 1022 |
+
logger.warning(f"Could not read stream: {stream_err}")
|
| 1023 |
+
|
| 1024 |
+
# Handle dict format
|
| 1025 |
+
elif isinstance(pdf_data, dict):
|
| 1026 |
if 'bytes' in pdf_data and pdf_data['bytes']:
|
|
|
|
| 1027 |
with open(temp_file_path, 'wb') as f:
|
| 1028 |
f.write(pdf_data['bytes'])
|
| 1029 |
+
logger.info(f"Wrote {len(pdf_data['bytes'])} bytes")
|
| 1030 |
+
extracted = True
|
| 1031 |
elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
|
|
|
|
| 1032 |
shutil.copy2(pdf_data['path'], temp_file_path)
|
| 1033 |
+
logger.info(f"Copied from dict path: {pdf_data['path']}")
|
| 1034 |
+
extracted = True
|
| 1035 |
+
|
| 1036 |
+
# Handle bytes directly
|
| 1037 |
elif isinstance(pdf_data, bytes):
|
|
|
|
| 1038 |
with open(temp_file_path, 'wb') as f:
|
| 1039 |
f.write(pdf_data)
|
| 1040 |
+
extracted = True
|
| 1041 |
+
|
| 1042 |
+
# Handle string path
|
| 1043 |
elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
|
|
|
|
| 1044 |
shutil.copy2(pdf_data, temp_file_path)
|
| 1045 |
+
extracted = True
|
|
|
|
|
|
|
| 1046 |
|
| 1047 |
+
if extracted and os.path.exists(temp_file_path):
|
| 1048 |
+
copied_files.append(temp_file_path)
|
| 1049 |
+
file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
|
| 1050 |
+
file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
|
| 1051 |
+
file_found = True
|
| 1052 |
+
logger.info(f"✅ Successfully extracted {filename}")
|
| 1053 |
+
break
|
| 1054 |
+
else:
|
| 1055 |
+
logger.error(f"Could not extract file: {type(pdf_data)}")
|
| 1056 |
+
|
| 1057 |
except Exception as ex:
|
| 1058 |
logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
|
| 1059 |
continue
|
| 1060 |
|
| 1061 |
if not file_found:
|
|
|
|
| 1062 |
logger.warning(f"❌ File {filename} not found in dataset rows")
|
| 1063 |
+
# Debug: show what's available
|
| 1064 |
for idx, row in enumerate(ds):
|
| 1065 |
pdf_data = row.get('pdf', None)
|
| 1066 |
+
if pdf_data and hasattr(pdf_data, 'stream') and hasattr(pdf_data.stream, 'name'):
|
| 1067 |
+
available_name = os.path.basename(str(pdf_data.stream.name))
|
|
|
|
|
|
|
|
|
|
| 1068 |
logger.info(f" Available file in row {idx}: '{available_name}'")
|
| 1069 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 1070 |
|