TilanB committed on
Commit
9d7baa4
·
verified ·
1 Parent(s): 80f44a7

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +78 -45
main.py CHANGED
@@ -937,65 +937,98 @@ setInterval(tick, 500);
937
  # Create temp directory for files
938
  temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
939
 
 
 
 
 
 
 
 
 
 
 
940
  # Extract requested files from dataset rows
941
  for file_path in file_names:
942
  filename = os.path.basename(file_path)
943
  file_found = False
944
 
 
 
945
  # Search through dataset rows
946
- for row in ds:
947
- # Check if this row contains our file
948
- # The dataset has a 'pdf' column with file paths
949
- row_filename = row.get('pdf', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
 
951
- # Match by filename (the PDF column stores filenames)
952
- if isinstance(row_filename, str) and os.path.basename(row_filename) == filename:
953
  temp_file_path = os.path.join(temp_dir, filename)
 
954
 
955
- # The 'pdf' column contains the actual file path/data
956
- # Datasets library auto-loads files from the 'pdf' column
957
- pdf_data = row.get('pdf')
958
-
959
- if pdf_data:
960
- try:
961
- # Check if it's already bytes
962
- if isinstance(pdf_data, bytes):
963
  with open(temp_file_path, 'wb') as f:
964
- f.write(pdf_data)
965
- # Check if it's a dict with 'bytes' key (common format)
966
- elif isinstance(pdf_data, dict):
967
- if 'bytes' in pdf_data:
968
- with open(temp_file_path, 'wb') as f:
969
- f.write(pdf_data['bytes'])
970
- elif 'path' in pdf_data:
971
- # It's a file path, copy the file
972
- import shutil
973
- shutil.copy2(pdf_data['path'], temp_file_path)
974
- # Try to read as file path string
975
- elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
976
- import shutil
977
- shutil.copy2(pdf_data, temp_file_path)
978
  else:
979
- logger.error(f"Unknown PDF data format: {type(pdf_data)}")
980
  continue
981
-
982
- copied_files.append(temp_file_path)
983
- file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
984
- file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
985
- file_found = True
986
- logger.info(f"Successfully extracted {filename} from dataset")
987
- break
988
- except Exception as ex:
989
- logger.error(f"Failed to extract {filename}: {ex}")
990
  continue
 
 
 
 
 
 
 
 
 
 
991
 
992
  if not file_found:
993
- logger.warning(f"File {filename} not found in dataset rows")
 
 
 
 
 
 
 
 
 
994
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
995
- # Debug: print available filenames
996
- available = [row.get('pdf', 'N/A') for row in ds]
997
- logger.debug(f"Available files in dataset: {available}")
998
-
999
  if not copied_files:
1000
  if len(ds) > 0:
1001
  logger.error(f"Dataset structure: {list(ds[0].keys())}")
@@ -1069,14 +1102,14 @@ setInterval(tick, 500);
1069
  if is_hf_space:
1070
  # Hugging Face Spaces configuration
1071
  logger.info("Running on Hugging Face Spaces")
1072
- demo.launch(theme=gr.themes.Soft(),server_name="0.0.0.0", server_port=7860, css=css, js=js)
1073
  else:
1074
  # Local development configuration
1075
  configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
1076
  server_port = _find_open_port(configured_port)
1077
  logger.info(f"Launching Gradio on port {server_port}")
1078
  logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
1079
- demo.launch(theme=gr.themes.Soft(),server_port=server_port, share=False, css=css, js=js)
1080
 
1081
 
1082
  if __name__ == "__main__":
 
937
  # Create temp directory for files
938
  temp_dir = tempfile.mkdtemp(prefix='hf_examples_')
939
 
940
+ # Debug: Log first row structure
941
+ if len(ds) > 0:
942
+ first_row = ds[0]
943
+ pdf_data = first_row.get('pdf', None)
944
+ logger.info(f"Dataset first row 'pdf' type: {type(pdf_data)}")
945
+ if isinstance(pdf_data, dict):
946
+ logger.info(f"PDF dict keys: {list(pdf_data.keys())}")
947
+ if 'path' in pdf_data:
948
+ logger.info(f"PDF path example: {pdf_data.get('path', 'N/A')}")
949
+
950
  # Extract requested files from dataset rows
951
  for file_path in file_names:
952
  filename = os.path.basename(file_path)
953
  file_found = False
954
 
955
+ logger.info(f"Looking for file: {filename}")
956
+
957
  # Search through dataset rows
958
+ for row_idx, row in enumerate(ds):
959
+ # The 'pdf' column contains file objects from HF datasets
960
+ pdf_data = row.get('pdf', None)
961
+
962
+ if pdf_data is None:
963
+ continue
964
+
965
+ # Extract the actual filename from the pdf data
966
+ # HF datasets library returns file objects as dicts with 'path' key
967
+ if isinstance(pdf_data, dict):
968
+ row_filename = pdf_data.get('path', '')
969
+ elif isinstance(pdf_data, str):
970
+ row_filename = pdf_data
971
+ else:
972
+ # Try to get path attribute (for other formats)
973
+ row_filename = getattr(pdf_data, 'path', '') or str(pdf_data)
974
+
975
+ row_basename = os.path.basename(str(row_filename))
976
+ logger.debug(f"Row {row_idx}: checking '{row_basename}' vs '{filename}'")
977
 
978
+ # Match by filename
979
+ if row_basename == filename:
980
  temp_file_path = os.path.join(temp_dir, filename)
981
+ logger.info(f"Found match! Extracting {filename}...")
982
 
983
+ try:
984
+ # Handle different data formats from HF datasets
985
+ if isinstance(pdf_data, dict):
986
+ if 'bytes' in pdf_data and pdf_data['bytes']:
987
+ # Most common: dict with 'bytes' key
 
 
 
988
  with open(temp_file_path, 'wb') as f:
989
+ f.write(pdf_data['bytes'])
990
+ logger.info(f"Wrote {len(pdf_data['bytes'])} bytes to {temp_file_path}")
991
+ elif 'path' in pdf_data and pdf_data['path'] and os.path.exists(pdf_data['path']):
992
+ # File path exists on disk (HF caches files)
993
+ shutil.copy2(pdf_data['path'], temp_file_path)
994
+ logger.info(f"Copied from cache: {pdf_data['path']}")
 
 
 
 
 
 
 
 
995
  else:
996
+ logger.error(f"Dict has no usable data: {list(pdf_data.keys())}")
997
  continue
998
+ elif isinstance(pdf_data, bytes):
999
+ # Direct bytes
1000
+ with open(temp_file_path, 'wb') as f:
1001
+ f.write(pdf_data)
1002
+ elif isinstance(pdf_data, str) and os.path.exists(pdf_data):
1003
+ # File path string
1004
+ shutil.copy2(pdf_data, temp_file_path)
1005
+ else:
1006
+ logger.error(f"Unknown PDF data type: {type(pdf_data)}")
1007
  continue
1008
+
1009
+ copied_files.append(temp_file_path)
1010
+ file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
1011
+ file_info_text += f"📄 {filename} ({file_size_mb:.2f} MB)\n"
1012
+ file_found = True
1013
+ logger.info(f"✅ Successfully extracted {filename}")
1014
+ break
1015
+ except Exception as ex:
1016
+ logger.error(f"Failed to extract {filename}: {ex}", exc_info=True)
1017
+ continue
1018
 
1019
  if not file_found:
1020
+ # Debug: show what's actually in the dataset
1021
+ logger.warning(f"❌ File {filename} not found in dataset rows")
1022
+ for idx, row in enumerate(ds):
1023
+ pdf_data = row.get('pdf', None)
1024
+ if pdf_data:
1025
+ if isinstance(pdf_data, dict):
1026
+ available_name = os.path.basename(str(pdf_data.get('path', 'unknown')))
1027
+ else:
1028
+ available_name = str(type(pdf_data))
1029
+ logger.info(f" Available file in row {idx}: '{available_name}'")
1030
  file_info_text += f"⚠️ {filename} - Not found in dataset\n"
1031
+
 
 
 
1032
  if not copied_files:
1033
  if len(ds) > 0:
1034
  logger.error(f"Dataset structure: {list(ds[0].keys())}")
 
1102
  if is_hf_space:
1103
  # Hugging Face Spaces configuration
1104
  logger.info("Running on Hugging Face Spaces")
1105
+ demo.launch(theme=gr.themes.Soft(), server_name="0.0.0.0", server_port=7860, css=css, js=js)
1106
  else:
1107
  # Local development configuration
1108
  configured_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
1109
  server_port = _find_open_port(configured_port)
1110
  logger.info(f"Launching Gradio on port {server_port}")
1111
  logger.info(f"Access the app at: http://127.0.0.1:{server_port}")
1112
+ demo.launch(theme=gr.themes.Soft(), server_name="127.0.0.1", server_port=server_port, share=False, css=css, js=js)
1113
 
1114
 
1115
  if __name__ == "__main__":