JayBene1 committed on
Commit
c175eac
·
verified ·
1 Parent(s): 0bafe91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -29
app.py CHANGED
@@ -257,47 +257,47 @@ def simulate_website_scraping(url):
257
  return contacts
258
 
259
  def parse_csv_file(file_content):
260
- """Parse CSV file and extract website URLs"""
261
  websites = []
262
  try:
263
  # Decode file content
264
  content = file_content.decode('utf-8')
265
 
266
- # Parse CSV
267
- csv_reader = csv.DictReader(io.StringIO(content))
 
268
 
269
- # Look for common website column names (case-insensitive)
270
- website_columns = ['website', 'url', 'domain', 'site', 'web', 'homepage']
271
 
272
- # Get all column names and print for debugging
273
- all_columns = list(csv_reader.fieldnames) if csv_reader.fieldnames else []
274
- print(f"CSV columns found: {all_columns}")
275
 
276
- # Find the website column (case-insensitive)
277
- website_column = None
278
- for col_name in all_columns:
279
- if col_name:
280
- # Clean and normalize column name for comparison
281
- clean_col_name = col_name.lower().strip()
282
- print(f"Checking column: '{col_name}' -> '{clean_col_name}'")
283
-
284
- if clean_col_name in website_columns:
285
- website_column = col_name # Use original column name
286
- print(f"Using website column: '{website_column}'")
287
- break
288
 
289
- if not website_column:
290
- print(f"No website column found. Available columns: {all_columns}")
291
- print(f"Looking for columns matching: {website_columns}")
292
  return []
293
 
294
- # Extract websites
295
- for row in csv_reader:
296
- website_url = row.get(website_column, '').strip()
297
- if website_url:
298
- websites.append(website_url)
 
 
 
 
 
 
 
 
 
299
 
300
- print(f"Extracted {len(websites)} websites: {websites[:5]}...") # Show first 5
301
  return websites
302
 
303
  except Exception as e:
 
257
  return contacts
258
 
259
  def parse_csv_file(file_content):
260
+ """Parse CSV file and extract website URLs from column H"""
261
  websites = []
262
  try:
263
  # Decode file content
264
  content = file_content.decode('utf-8')
265
 
266
+ # Parse CSV without headers first to access by column index
267
+ csv_reader = csv.reader(io.StringIO(content))
268
+ rows = list(csv_reader)
269
 
270
+ print(f"Total rows in CSV: {len(rows)}")
 
271
 
272
+ if len(rows) == 0:
273
+ print("CSV file is empty")
274
+ return []
275
 
276
+ # Check if we have at least column H (index 7, since A=0, B=1, ..., H=7)
277
+ first_row = rows[0]
278
+ print(f"First row columns: {len(first_row)}")
279
+ print(f"First row content: {first_row}")
 
 
 
 
 
 
 
 
280
 
281
+ if len(first_row) < 8: # Column H is index 7
282
+ print(f"CSV doesn't have column H. Only has {len(first_row)} columns")
 
283
  return []
284
 
285
+ print(f"Column H header: '{first_row[7]}'")
286
+
287
+ # Extract websites from column H (index 7)
288
+ # Start from row 1 to skip header
289
+ for i, row in enumerate(rows[1:], start=2): # Start=2 for proper row numbering
290
+ if len(row) > 7: # Make sure row has column H
291
+ website_url = row[7].strip()
292
+ if website_url:
293
+ websites.append(website_url)
294
+ print(f"Row {i}: Found website '{website_url}'")
295
+ else:
296
+ print(f"Row {i}: Column H is empty")
297
+ else:
298
+ print(f"Row {i}: Row too short, only has {len(row)} columns")
299
 
300
+ print(f"Extracted {len(websites)} websites from column H: {websites[:5]}...") # Show first 5
301
  return websites
302
 
303
  except Exception as e: