JayBene1 committed on
Commit
d4231f8
·
verified ·
1 Parent(s): 9ebdf6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -45
app.py CHANGED
@@ -3,6 +3,7 @@ import requests
3
  import json
4
  from typing import List, Dict, Any
5
  import pandas as pd
 
6
 
7
  # Hardcoded API endpoint - Updated to correct endpoint
8
  API_ENDPOINT = "https://jaybene1-testapicontacts.hf.space/contacts"
@@ -34,6 +35,107 @@ theme = gr.themes.Soft(
34
  button_secondary_text_color="#334155"
35
  )
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def search_contacts(url: str) -> tuple[str, str]:
38
  """
39
  Search for contacts associated with a given URL using the Hugging Face API.
@@ -50,6 +152,7 @@ def search_contacts(url: str) -> tuple[str, str]:
50
 
51
  # Clean and validate URL
52
  url = url.strip()
 
53
  if not url.startswith(('http://', 'https://')):
54
  url = 'https://' + url
55
 
@@ -70,9 +173,6 @@ def search_contacts(url: str) -> tuple[str, str]:
70
  try:
71
  result = response.json()
72
 
73
- # Add debugging info
74
- request_url = f"{API_ENDPOINT}?url={url}"
75
-
76
  # Hugging Face Spaces API returns data in a different format
77
  if isinstance(result, dict) and 'data' in result:
78
  actual_result = result['data'][0] if result['data'] else {}
@@ -81,19 +181,51 @@ def search_contacts(url: str) -> tuple[str, str]:
81
 
82
  # Apply client-side filtering
83
  if isinstance(actual_result, list):
84
- actual_result = filter_contacts_by_url(actual_result, url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  elif isinstance(actual_result, dict) and 'contacts' in actual_result:
86
- actual_result['contacts'] = filter_contacts_by_url(actual_result['contacts'], url)
87
-
88
- # Format the results for display
89
- formatted_output = format_contact_results(actual_result, url)
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- # Add debugging info to the formatted output
92
  debug_info = f"\n\n**Debug Info:**\n"
93
  debug_info += f"- Request URL: {API_ENDPOINT}\n"
94
  debug_info += f"- Search URL: {url}\n"
95
- debug_info += f"- Response contains {len(actual_result) if isinstance(actual_result, list) else 'N/A'} items\n"
96
- debug_info += f"- Client-side filtering applied\n"
 
 
 
 
97
 
98
  formatted_output += debug_info
99
  raw_json = json.dumps(result, indent=2)
@@ -113,40 +245,6 @@ def search_contacts(url: str) -> tuple[str, str]:
113
  except Exception as e:
114
  return f"❌ Error: {str(e)}", ""
115
 
116
- def filter_contacts_by_url(contacts, search_url):
117
- """
118
- Filter contacts based on the search URL.
119
- This is a client-side filter in case the API doesn't filter properly.
120
- """
121
- if not isinstance(contacts, list):
122
- return contacts
123
-
124
- # Extract domain from search URL
125
- search_domain = search_url.replace('https://', '').replace('http://', '').replace('www.', '').split('/')[0].lower()
126
-
127
- filtered_contacts = []
128
- for contact in contacts:
129
- # Check if contact is associated with the search domain
130
- if isinstance(contact, dict):
131
- # Check various fields that might contain the domain
132
- fields_to_check = [
133
- contact.get('website', ''),
134
- contact.get('company_website', ''),
135
- contact.get('domain', ''),
136
- contact.get('email', ''),
137
- contact.get('company', ''),
138
- str(contact.get('url', '')),
139
- str(contact.get('source', ''))
140
- ]
141
-
142
- # Check if any field contains the search domain
143
- for field in fields_to_check:
144
- if field and search_domain in str(field).lower():
145
- filtered_contacts.append(contact)
146
- break
147
-
148
- return filtered_contacts if filtered_contacts else contacts
149
-
150
  def format_contact_results(results: Dict[Any, Any], url: str) -> str:
151
  """
152
  Format the API results into a readable format.
@@ -312,6 +410,7 @@ with gr.Blocks(theme=theme, title="Contact Search - Kwekel Companies", css="""
312
  <li>View formatted results and raw JSON response</li>
313
  </ol>
314
  <p><strong>API Endpoint:</strong> {API_ENDPOINT}</p>
 
315
  </div>
316
  """)
317
 
 
3
  import json
4
  from typing import List, Dict, Any
5
  import pandas as pd
6
+ from urllib.parse import urlparse
7
 
8
  # Hardcoded API endpoint - Updated to correct endpoint
9
  API_ENDPOINT = "https://jaybene1-testapicontacts.hf.space/contacts"
 
35
  button_secondary_text_color="#334155"
36
  )
37
 
38
def normalize_url(url: str) -> str:
    """
    Reduce a URL (or URL-like string) to its bare, lowercase host name.

    The scheme, a leading ``www.``, credentials, port, path, query string
    and fragment are all removed so that two spellings of the same site
    compare equal.

    Args:
        url: The URL to normalize. May omit the scheme
            (e.g. ``"www.example.com/about"``).

    Returns:
        The lowercase host name, or an empty string when ``url`` is empty
        or contains only whitespace.
    """
    if not url:
        return ""

    host = url.strip().lower()

    # Strip the scheme only when it is a real prefix; a blanket
    # str.replace would also eat "http://" embedded later in the string.
    for scheme in ("https://", "http://"):
        if host.startswith(scheme):
            host = host[len(scheme):]
            break

    # Everything from the first path, query or fragment delimiter onwards
    # is not part of the host.
    for separator in "/?#":
        host = host.split(separator, 1)[0]

    # Drop "user:password@" credentials. This also reduces a plain e-mail
    # address to its domain part.
    host = host.rpartition("@")[2]

    # Remove the port number.
    host = host.split(":", 1)[0]

    # Only a *leading* "www." is decoration; "awww.example.com" must keep
    # its label intact.
    if host.startswith("www."):
        host = host[len("www."):]

    return host
61
+
62
# Second-level labels that commonly sit directly under a two-letter
# country-code TLD ("co.uk", "com.au", ...). This is a heuristic: the
# standard library ships no public-suffix list.
_COMPOUND_SUFFIX_LABELS = frozenset(
    {"ac", "co", "com", "edu", "gov", "net", "org"}
)


def extract_domain_variations(url: str) -> List[str]:
    """
    Build the list of host-name spellings that should count as a match.

    Args:
        url: The URL to extract domains from.

    Returns:
        A list holding the normalized host, the same host prefixed with
        ``www.`` and, when the host has a subdomain, the parent domain
        without it. The list is empty when no host can be extracted,
        because an empty string is a substring of every other string and
        would make substring-based matching accept everything.
    """
    normalized = normalize_url(url)
    if not normalized:
        return []

    variations = [normalized, f"www.{normalized}"]

    labels = normalized.split('.')

    # Dotted-quad IP addresses have no parent domain; trimming them to
    # their last two octets would match unrelated addresses.
    if all(label.isdigit() for label in labels):
        return variations

    # Keep three labels for hosts such as "shop.example.co.uk"; keeping
    # only two would yield "co.uk", which matches every site under it.
    keep = 2
    if (
        len(labels) > 2
        and len(labels[-1]) == 2
        and labels[-2] in _COMPOUND_SUFFIX_LABELS
    ):
        keep = 3

    if len(labels) > keep:
        # Add the host without its subdomain(s).
        variations.append('.'.join(labels[-keep:]))

    return variations
86
+
87
# Contact keys whose values may hold a URL, a domain or an e-mail address.
_CONTACT_DOMAIN_FIELDS = (
    'website',
    'company_website',
    'domain',
    'email',
    'company',
    'url',
    'source',
    'company_url',
    'origin_url',
)


def _contact_domain_candidates(contact):
    """Return the non-empty normalized strings of ``contact`` to compare."""
    candidates = []
    for key in _CONTACT_DOMAIN_FIELDS:
        value = contact.get(key)
        # Skip missing/None/empty values up front; str(None) would
        # otherwise be compared as the literal text "None".
        if not value:
            continue
        normalized = normalize_url(str(value))
        # A value that normalizes to "" (e.g. "https://") must be dropped:
        # the empty string is contained in every search domain.
        if normalized:
            candidates.append(normalized)

    # Also compare the bare e-mail domain (the part after the last "@").
    email = contact.get('email')
    if isinstance(email, str) and '@' in email:
        email_domain = email.rsplit('@', 1)[1].strip().lower()
        if email_domain:
            candidates.append(email_domain)

    return candidates


def filter_contacts_by_url(contacts, search_url):
    """
    Keep only the contacts that appear to belong to the searched URL.

    This is a client-side filter in case the API doesn't filter properly.

    Args:
        contacts: The contacts returned by the API. Anything that is not a
            list is returned unchanged.
        search_url: The URL the user searched for.

    Returns:
        A new list with the matching contact dicts in their original
        order. Non-dict entries are dropped. The list is empty when no
        host name can be extracted from ``search_url``.
    """
    if not isinstance(contacts, list):
        return contacts

    # Without a host there is nothing to filter on. Continuing would
    # compare against an empty string, which matches every contact.
    if not normalize_url(search_url):
        return []

    search_domains = extract_domain_variations(search_url)

    def matches(candidate):
        # NOTE: matching is a deliberately loose two-way substring test, so
        # a company name like "example" matches "example.com". Very short
        # values can therefore produce false positives.
        return any(
            domain in candidate or candidate in domain
            for domain in search_domains
        )

    return [
        contact
        for contact in contacts
        if isinstance(contact, dict)
        and any(matches(c) for c in _contact_domain_candidates(contact))
    ]
138
+
139
  def search_contacts(url: str) -> tuple[str, str]:
140
  """
141
  Search for contacts associated with a given URL using the Hugging Face API.
 
152
 
153
  # Clean and validate URL
154
  url = url.strip()
155
+ original_url = url
156
  if not url.startswith(('http://', 'https://')):
157
  url = 'https://' + url
158
 
 
173
  try:
174
  result = response.json()
175
 
 
 
 
176
  # Hugging Face Spaces API returns data in a different format
177
  if isinstance(result, dict) and 'data' in result:
178
  actual_result = result['data'][0] if result['data'] else {}
 
181
 
182
  # Apply client-side filtering
183
  if isinstance(actual_result, list):
184
+ original_count = len(actual_result)
185
+ filtered_result = filter_contacts_by_url(actual_result, url)
186
+
187
+ # If we filtered out everything or got very few results compared to original,
188
+ # it's likely the API returned all contacts instead of URL-specific ones
189
+ if len(filtered_result) == 0:
190
+ return f"❌ No contacts found for: {original_url}\n\nThis URL may not be in our database.", json.dumps(result, indent=2)
191
+ elif original_count > 50 and len(filtered_result) < 5:
192
+ # Likely got all contacts, filtered to very few - show warning
193
+ formatted_output = f"⚠️ **Warning:** API returned {original_count} total contacts. After filtering for '{original_url}', found {len(filtered_result)} matches.\n\n"
194
+ formatted_output += "These results may not be accurate. The URL might not be in our database.\n\n"
195
+ formatted_output += format_contact_results(filtered_result, url)
196
+ else:
197
+ formatted_output = format_contact_results(filtered_result, url)
198
+
199
+ actual_result = filtered_result
200
+
201
  elif isinstance(actual_result, dict) and 'contacts' in actual_result:
202
+ original_count = len(actual_result['contacts'])
203
+ filtered_contacts = filter_contacts_by_url(actual_result['contacts'], url)
204
+
205
+ if len(filtered_contacts) == 0:
206
+ return f"❌ No contacts found for: {original_url}\n\nThis URL may not be in our database.", json.dumps(result, indent=2)
207
+ elif original_count > 50 and len(filtered_contacts) < 5:
208
+ formatted_output = f"⚠️ **Warning:** API returned {original_count} total contacts. After filtering for '{original_url}', found {len(filtered_contacts)} matches.\n\n"
209
+ formatted_output += "These results may not be accurate. The URL might not be in our database.\n\n"
210
+ formatted_output += format_contact_results({'contacts': filtered_contacts}, url)
211
+ else:
212
+ formatted_output = format_contact_results({'contacts': filtered_contacts}, url)
213
+
214
+ actual_result['contacts'] = filtered_contacts
215
+ else:
216
+ # Single contact or unknown format
217
+ formatted_output = format_contact_results(actual_result, url)
218
 
219
+ # Add debugging info
220
  debug_info = f"\n\n**Debug Info:**\n"
221
  debug_info += f"- Request URL: {API_ENDPOINT}\n"
222
  debug_info += f"- Search URL: {url}\n"
223
+ debug_info += f"- Original response size: {len(result.get('data', [result])) if isinstance(result, dict) else len(result) if isinstance(result, list) else 'N/A'}\n"
224
+ debug_info += f"- Filtered results: {len(actual_result) if isinstance(actual_result, list) else len(actual_result.get('contacts', [])) if isinstance(actual_result, dict) else 'N/A'}\n"
225
+ debug_info += f"- Client-side filtering applied: Yes\n"
226
+
227
+ if 'formatted_output' not in locals():
228
+ formatted_output = format_contact_results(actual_result, url)
229
 
230
  formatted_output += debug_info
231
  raw_json = json.dumps(result, indent=2)
 
245
  except Exception as e:
246
  return f"❌ Error: {str(e)}", ""
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  def format_contact_results(results: Dict[Any, Any], url: str) -> str:
249
  """
250
  Format the API results into a readable format.
 
410
  <li>View formatted results and raw JSON response</li>
411
  </ol>
412
  <p><strong>API Endpoint:</strong> {API_ENDPOINT}</p>
413
+ <p><strong>Note:</strong> The system will filter results to match your specific URL and warn you if the URL might not be in the database.</p>
414
  </div>
415
  """)
416