Update app.py
Browse files
app.py
CHANGED
|
@@ -257,47 +257,47 @@ def simulate_website_scraping(url):
|
|
| 257 |
return contacts
|
| 258 |
|
| 259 |
def parse_csv_file(file_content):
|
| 260 |
-
"""Parse CSV file and extract website URLs"""
|
| 261 |
websites = []
|
| 262 |
try:
|
| 263 |
# Decode file content
|
| 264 |
content = file_content.decode('utf-8')
|
| 265 |
|
| 266 |
-
# Parse CSV
|
| 267 |
-
csv_reader = csv.
|
|
|
|
| 268 |
|
| 269 |
-
|
| 270 |
-
website_columns = ['website', 'url', 'domain', 'site', 'web', 'homepage']
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
|
| 276 |
-
#
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
# Clean and normalize column name for comparison
|
| 281 |
-
clean_col_name = col_name.lower().strip()
|
| 282 |
-
print(f"Checking column: '{col_name}' -> '{clean_col_name}'")
|
| 283 |
-
|
| 284 |
-
if clean_col_name in website_columns:
|
| 285 |
-
website_column = col_name # Use original column name
|
| 286 |
-
print(f"Using website column: '{website_column}'")
|
| 287 |
-
break
|
| 288 |
|
| 289 |
-
if
|
| 290 |
-
print(f"
|
| 291 |
-
print(f"Looking for columns matching: {website_columns}")
|
| 292 |
return []
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
-
print(f"Extracted {len(websites)} websites: {websites[:5]}...") # Show first 5
|
| 301 |
return websites
|
| 302 |
|
| 303 |
except Exception as e:
|
|
|
|
| 257 |
return contacts
|
| 258 |
|
| 259 |
def parse_csv_file(file_content, column_index=7):
    """Parse raw CSV file bytes and extract website URLs from one column.

    The CSV is read positionally (no header mapping) so URLs can live in a
    fixed spreadsheet column regardless of what the header row calls it.

    Args:
        file_content: Raw CSV content as UTF-8 encoded bytes.
        column_index: Zero-based index of the column holding the URLs.
            Defaults to 7, i.e. spreadsheet column H (A=0, B=1, ..., H=7).

    Returns:
        list[str]: Non-empty, whitespace-stripped URL strings in row order.
        Returns an empty list when the file is empty, the header row lacks
        the target column, or the file cannot be decoded/parsed.
    """
    websites = []
    try:
        # Decode file content
        content = file_content.decode('utf-8')

        # Parse CSV without headers first to access by column index
        csv_reader = csv.reader(io.StringIO(content))
        rows = list(csv_reader)

        print(f"Total rows in CSV: {len(rows)}")

        if len(rows) == 0:
            print("CSV file is empty")
            return []

        # The header row must reach the target column, otherwise there is
        # nothing to extract from any data row either.
        first_row = rows[0]
        print(f"First row columns: {len(first_row)}")
        print(f"First row content: {first_row}")

        if len(first_row) <= column_index:
            print(f"CSV doesn't have column H. Only has {len(first_row)} columns")
            return []

        print(f"Column H header: '{first_row[column_index]}'")

        # Extract websites from the target column, skipping the header row.
        # start=2 so the printed row numbers match spreadsheet numbering
        # (row 1 is the header).
        for i, row in enumerate(rows[1:], start=2):
            if len(row) > column_index:  # Make sure row reaches the column
                website_url = row[column_index].strip()
                if website_url:
                    websites.append(website_url)
                    print(f"Row {i}: Found website '{website_url}'")
                else:
                    print(f"Row {i}: Column H is empty")
            else:
                print(f"Row {i}: Row too short, only has {len(row)} columns")

        print(f"Extracted {len(websites)} websites from column H: {websites[:5]}...")  # Show first 5
        return websites

    except Exception as e:
        # Best-effort parser: log and return an empty result rather than
        # crashing the upload flow on a malformed/undecodable file.
        print(f"Error parsing CSV file: {e}")
        return []