Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -945,41 +945,56 @@ setInterval(tick, 500);
|
|
| 945 |
# Search through dataset rows
|
| 946 |
for row in ds:
|
| 947 |
# Check if this row contains our file
|
| 948 |
-
#
|
| 949 |
-
row_filename = row.get('
|
| 950 |
|
| 951 |
-
|
|
|
|
| 952 |
temp_file_path = os.path.join(temp_dir, filename)
|
| 953 |
|
| 954 |
-
#
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
f.write(row['content'])
|
| 958 |
-
elif 'file' in row and row['file']:
|
| 959 |
-
file_obj = row['file']
|
| 960 |
-
if isinstance(file_obj, dict) and 'bytes' in file_obj:
|
| 961 |
-
with open(temp_file_path, 'wb') as f:
|
| 962 |
-
f.write(file_obj['bytes'])
|
| 963 |
-
elif isinstance(file_obj, bytes):
|
| 964 |
-
with open(temp_file_path, 'wb') as f:
|
| 965 |
-
f.write(file_obj)
|
| 966 |
-
elif 'data' in row and row['data']:
|
| 967 |
-
with open(temp_file_path, 'wb') as f:
|
| 968 |
-
f.write(row['data'])
|
| 969 |
-
else:
|
| 970 |
-
logger.warning(f"Unknown dataset format for {filename}, fields: {list(row.keys())}")
|
| 971 |
-
continue
|
| 972 |
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 979 |
|
| 980 |
if not file_found:
|
| 981 |
logger.warning(f"File {filename} not found in dataset rows")
|
| 982 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
|
|
|
|
|
|
|
|
|
| 983 |
|
| 984 |
if not copied_files:
|
| 985 |
if len(ds) > 0:
|
|
|
|
| 945 |
# Search through dataset rows
|
| 946 |
for row in ds:
|
| 947 |
# Check if this row contains our file
|
| 948 |
+
# The dataset has a 'pdf' column with file paths
|
| 949 |
+
row_filename = row.get('pdf', '')
|
| 950 |
|
| 951 |
+
# Match on the basename of the 'pdf' value, so a stored path still matches the bare filename
|
| 952 |
+
if isinstance(row_filename, str) and os.path.basename(row_filename) == filename:
|
| 953 |
temp_file_path = os.path.join(temp_dir, filename)
|
| 954 |
|
| 955 |
+
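# The 'pdf' column holds the file path or data. NOTE(review): this is the same value as row_filename, which the check above already required to be a str, so the bytes/dict branches below look unreachable - confirm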
|
| 956 |
+
# Presumably the datasets library auto-loads files from the 'pdf' column - TODO confirm against the dataset's feature type
|
| 957 |
+
pdf_data = row.get('pdf')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
|
| 959 |
+
if pdf_data:
|
| 960 |
+
try:
|
| 961 |
+
# Check if it's already bytes
|
| 962 |
+
if isinstance(pdf_data, bytes):
|
| 963 |
+
with open(temp_file_path, 'wb') as f:
|
| 964 |
+
f.write(pdf_data)
|
| 965 |
+
# Check if it's a dict with 'bytes' key (common format)
|
| 966 |
+
elif isinstance(pdf_data, dict):
|
| 967 |
+
if 'bytes' in pdf_data:
|
| 968 |
+
with open(temp_file_path, 'wb') as f:
|
| 969 |
+
f.write(pdf_data['bytes'])
|
| 970 |
+
elif 'path' in pdf_data:
|
| 971 |
+
# It's a file path, copy the file
|
| 972 |
+
import shutil
|
| 973 |
+
shutil.copy2(pdf_data['path'], temp_file_path)
|
| 974 |
+
# Otherwise treat it as a file path string and copy the file if it exists
|
| 975 |
+
elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
|
| 976 |
+
import shutil
|
| 977 |
+
shutil.copy2(pdf_data, temp_file_path)
|
| 978 |
+
else:
|
| 979 |
+
logger.error(f"Unknown PDF data format: {type(pdf_data)}")
|
| 980 |
+
continue
|
| 981 |
+
|
| 982 |
+
copied_files.append(temp_file_path)
|
| 983 |
+
file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
|
| 984 |
+
file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
|
| 985 |
+
file_found = True
|
| 986 |
+
logger.info(f"Successfully extracted {filename} from dataset")
|
| 987 |
+
break
|
| 988 |
+
except Exception as ex:
|
| 989 |
+
logger.error(f"Failed to extract {filename}: {ex}")
|
| 990 |
+
continue
|
| 991 |
|
| 992 |
if not file_found:
|
| 993 |
logger.warning(f"File {filename} not found in dataset rows")
|
| 994 |
file_info_text += f"⚠️ {filename} - Not found in dataset\n"
|
| 995 |
+
# Debug: log the 'pdf' value of every row so a filename mismatch can be diagnosed
|
| 996 |
+
available = [row.get('pdf', 'N/A') for row in ds]
|
| 997 |
+
logger.debug(f"Available files in dataset: {available}")
|
| 998 |
|
| 999 |
if not copied_files:
|
| 1000 |
if len(ds) > 0:
|