muhammadnoman76 committed on
Commit
2c8665a
·
1 Parent(s): b591a11
app/services/tools.py CHANGED
@@ -3,16 +3,10 @@ import logging
3
  import os
4
  from pathlib import Path
5
  from typing import Any, Dict, List, Optional
6
-
7
  from PIL import Image
8
-
9
  from app.services.vector_database_search import VectorDatabaseSearch
10
  from app.services.websearch import WebSearch
11
  from MagicConvert import MagicConvert
12
- from app.services.prompts import (
13
- ADVICE_REPORT_SUGGESTION,
14
- URDU_ADVICE_REPORT_SUGGESTION,
15
- )
16
  from app.services.image_classification_vit import SkinDiseaseClassifier
17
 
18
  try:
 
3
  import os
4
  from pathlib import Path
5
  from typing import Any, Dict, List, Optional
 
6
  from PIL import Image
 
7
  from app.services.vector_database_search import VectorDatabaseSearch
8
  from app.services.websearch import WebSearch
9
  from MagicConvert import MagicConvert
 
 
 
 
10
  from app.services.image_classification_vit import SkinDiseaseClassifier
11
 
12
  try:
app/services/websearch.py CHANGED
@@ -5,7 +5,8 @@ from bs4 import BeautifulSoup
5
  import urllib.parse
6
  import time
7
  import random
8
- from urllib.parse import urlparse, parse_qs
 
9
 
10
  warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
11
 
@@ -113,35 +114,33 @@ class WebSearch:
113
  # If any error occurs, return the original URL
114
  return url
115
 
116
- def extract_real_url_from_redirect(self, url):
117
- """Extract the actual URL from a redirect URL"""
118
  try:
119
- parsed = urlparse(url)
 
120
 
121
- # Handle DuckDuckGo redirects
122
- if "duckduckgo.com" in parsed.netloc and "u3=" in parsed.query:
123
- params = parse_qs(parsed.query)
124
- if "u3" in params and params["u3"]:
125
- redirect_url = params["u3"][0]
126
- # Handle nested redirects (like Bing redirects inside DuckDuckGo)
127
- if "bing.com/aclick" in redirect_url:
128
- bing_parsed = urlparse(redirect_url)
129
- bing_params = parse_qs(bing_parsed.query)
130
- if "u" in bing_params and bing_params["u"]:
131
- decoded_url = urllib.parse.unquote(bing_params["u"][0])
132
- return self.clean_url(decoded_url)
133
- return self.clean_url(redirect_url)
134
 
135
- # Handle Bing redirects
136
- if "bing.com/aclick" in url:
137
- params = parse_qs(parsed.query)
138
- if "u" in params and params["u"]:
139
- return self.clean_url(urllib.parse.unquote(params["u"][0]))
140
-
141
- return url
142
 
143
- except Exception:
144
- return url
 
 
 
 
 
 
 
 
145
 
146
  def extract_text_from_webpage(self, html_content):
147
  soup = BeautifulSoup(html_content, "html.parser")
@@ -159,34 +158,42 @@ class WebSearch:
159
 
160
  def search(self, query):
161
  results = []
162
- encoded_query = urllib.parse.quote(query)
163
- url = f'https://html.duckduckgo.com/html/?q={encoded_query}'
 
 
 
 
 
 
 
 
164
 
165
  try:
166
  with requests.Session() as session:
167
  session.headers.update(self.headers)
168
 
169
- response = session.get(url, timeout=10)
 
 
 
 
170
  soup = BeautifulSoup(response.text, 'html.parser')
171
 
172
- # Getting more results than needed to account for filtering
173
- search_results = soup.find_all('div', class_='result')[:self.num_results * 2]
174
  links = []
175
-
176
- # Extract and process links
177
- for result in search_results:
178
- link_tag = result.find('a', class_='result__a')
179
- if not link_tag or not link_tag.get('href'):
180
- continue
181
-
182
- original_link = link_tag['href']
183
-
184
- # Process link to get the actual URL
185
- clean_link = self.extract_real_url_from_redirect(original_link)
186
-
187
- # Validate the URL
188
- if self.is_valid_url(clean_link):
189
- links.append(clean_link)
190
 
191
  # Prioritize content domains
192
  prioritized_links = []
@@ -211,8 +218,7 @@ class WebSearch:
211
  unique_links.append(link)
212
  seen_domains.add(domain)
213
 
214
- from concurrent.futures import ThreadPoolExecutor, as_completed
215
-
216
  def fetch_page(link):
217
  try:
218
  # Random delay to avoid being blocked
 
5
  import urllib.parse
6
  import time
7
  import random
8
+ from urllib.parse import urlparse, parse_qs, unquote
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
 
11
  warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
12
 
 
114
  # If any error occurs, return the original URL
115
  return url
116
 
117
def extract_url_from_duckduckgo(self, ddg_link):
    """Return the destination URL hidden inside a DuckDuckGo redirect link.

    DuckDuckGo result anchors carry the real target in the ``uddg`` query
    parameter. When that parameter is absent the link itself is treated as
    the destination.

    Args:
        ddg_link: The raw ``href`` taken from a search result anchor.

    Returns:
        The destination URL after being passed through ``self.clean_url``.
        If anything raises while processing, ``ddg_link`` is returned
        unchanged (best-effort behaviour).
    """
    try:
        query_params = parse_qs(urlparse(ddg_link).query)

        # parse_qs already percent-decodes values; decoding a second time
        # would corrupt targets that legitimately contain escapes
        # (e.g. "%2520" must stay "%20", not become a raw space).
        actual_url = query_params.get("uddg", [""])[0] or ddg_link

        if actual_url.startswith("//"):
            # Protocol-relative link: only the scheme is missing. Prepending
            # "https://" here would yield "https:////host/...".
            actual_url = "https:" + actual_url
        elif actual_url and not actual_url.startswith(("http://", "https://")):
            actual_url = "https://" + actual_url

        # Strip tracking parameters from the final URL.
        return self.clean_url(actual_url)

    except Exception as e:
        # Best effort: never let a malformed link abort the whole search.
        print(f"Error extracting URL: {e}")
        return ddg_link
144
 
145
  def extract_text_from_webpage(self, html_content):
146
  soup = BeautifulSoup(html_content, "html.parser")
 
158
 
159
  def search(self, query):
160
  results = []
161
+
162
+ # Sanitize query
163
+ query = re.sub(r'[<>"\']', "", query.strip())
164
+ if not query:
165
+ print("Empty search query")
166
+ return []
167
+
168
+ # Setup request parameters
169
+ params = {"q": query, "kl": "us-en"}
170
+ url = "https://html.duckduckgo.com/html/"
171
 
172
  try:
173
  with requests.Session() as session:
174
  session.headers.update(self.headers)
175
 
176
+ # Make search request
177
+ response = session.get(url, params=params, timeout=10)
178
+ response.raise_for_status()
179
+
180
+ # Parse results
181
  soup = BeautifulSoup(response.text, 'html.parser')
182
 
183
+ # Extract links using simplified approach
 
184
  links = []
185
+ for result in soup.select("div.result"):
186
+ title_tag = result.select_one("a.result__a")
187
+ if title_tag:
188
+ # Get the href attribute
189
+ raw_link = title_tag.get("href", "")
190
+ if raw_link:
191
+ # Extract actual URL from DuckDuckGo redirect
192
+ actual_url = self.extract_url_from_duckduckgo(raw_link)
193
+
194
+ # Validate the URL
195
+ if self.is_valid_url(actual_url):
196
+ links.append(actual_url)
 
 
 
197
 
198
  # Prioritize content domains
199
  prioritized_links = []
 
218
  unique_links.append(link)
219
  seen_domains.add(domain)
220
 
221
+ # Fetch page content
 
222
  def fetch_page(link):
223
  try:
224
  # Random delay to avoid being blocked
ddg_response.html ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> -->
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+
5
+ <head>
6
+ <link rel="canonical" href="https://duckduckgo.com/">
7
+ <meta http-equiv="content-type" content="text/html; charset=UTF-8">
8
+ <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=3.0, user-scalable=1">
9
+ <meta name="referrer" content="origin">
10
+ <title>
11
+ DuckDuckGo
12
+ </title>
13
+ <link rel="preload" href="/font/ProximaNova-Reg-webfont.woff2" as="font" type="font/woff2" crossorigin="anonymous">
14
+ <link rel="preload" href="/font/ProximaNova-Sbold-webfont.woff2" as="font" type="font/woff2" crossorigin="anonymous">
15
+ <link title="DuckDuckGo (Lite)" type="application/opensearchdescription+xml" rel="search" href="//duckduckgo.com/opensearch_lite_v2.xml">
16
+ <link rel="icon" href="//duckduckgo.com/favicon.ico" type="image/x-icon"/>
17
+ <link id="icon60" rel="apple-touch-icon" href="//duckduckgo.com/assets/icons/meta/DDG-iOS-icon_60x60.png?v=2"/>
18
+ <link id="icon76" rel="apple-touch-icon" sizes="76x76" href="//duckduckgo.com/assets/icons/meta/DDG-iOS-icon_76x76.png?v=2"/>
19
+ <link id="icon120" rel="apple-touch-icon" sizes="120x120" href="//duckduckgo.com/assets/icons/meta/DDG-iOS-icon_120x120.png?v=2"/>
20
+ <link id="icon152" rel="apple-touch-icon" sizes="152x152" href="//duckduckgo.com/assets/icons/meta/DDG-iOS-icon_152x152.png?v=2"/>
21
+ <link rel="image_src" href="//duckduckgo.com/assets/icons/meta/DDG-icon_256x256.png">
22
+ <link rel="stylesheet" media="handheld, all" href="//duckduckgo.com/dist/l.dedf9a21e97bacda4782.css" type="text/css"/>
23
+ <link rel="stylesheet" media="handheld, all" href="//duckduckgo.com/dist/lc.ddfa8c0ba43459a9159a.css" type="text/css"/>
24
+ </head>
25
+
26
+ <body>
27
+ <a name="top"></a>
28
+ <center id="lite_wrapper">
29
+ <br>
30
+ <a class="header-url" href="/html/">
31
+ <span class="header">DuckDuckGo</span>
32
+ </a>
33
+ <br><br>
34
+
35
+
36
+ <iframe name="ifr" width="0" height="0" border="0" class="hidden"></iframe>
37
+ <form id="img-form" action="//duckduckgo.com/anomaly.js?sv=html&cc=botnet&ti=1758386895&gk=d4cd0dabcf4caa22ad92fab40844c786&p=d72c36ab4cb24f49969ecef4b2b88ad1-a3ec151498ae45aa845c5dda2f14a862-61b69bac6cc64d5886ca11ee9fc0eef2-d2cb2f084d1d45b6a98513a79b944bbc-5e6714da571b4c8895e1d949f47cedc2-23c1e3ae8ce8443183e624800acf2f47-ae7275bb00734944820af39a39d39107-d13828125211402eaef44405520c4080-aa87423535154d4f8f2a1cfff1de7a8b&q=acne&o=FYGE4bAiNVr8IcPNJsONTSwq441QyPzksuoqxWPyWuuL0OwnOoQUGNKgGm2X8RYI%0A&r=inc" target="ifr" method="POST"></form>
38
+ <form id="challenge-form" action="//duckduckgo.com/anomaly.js?sv=html&cc=botnet&st=1758386895&gk=d4cd0dabcf4caa22ad92fab40844c786&p=d72c36ab4cb24f49969ecef4b2b88ad1-a3ec151498ae45aa845c5dda2f14a862-61b69bac6cc64d5886ca11ee9fc0eef2-d2cb2f084d1d45b6a98513a79b944bbc-5e6714da571b4c8895e1d949f47cedc2-23c1e3ae8ce8443183e624800acf2f47-ae7275bb00734944820af39a39d39107-d13828125211402eaef44405520c4080-aa87423535154d4f8f2a1cfff1de7a8b&q=acne&o=FYGE4bAiNVr8IcPNJsONTSwq441QyPzksuoqxWPyWuuL0OwnOoQUGNKgGm2X8RYI%0A&r=inc" method="POST">
39
+ <div class="anomaly-modal__mask">
40
+ <div class="anomaly-modal__modal is-ie" data-testid="anomaly-modal">
41
+ <div class="anomaly-modal__title">Unfortunately, bots use DuckDuckGo too.</div>
42
+ <div class="anomaly-modal__description">Please complete the following challenge to confirm this search was made by a human.</div>
43
+ <div class="anomaly-modal__instructions">Select all squares containing a duck:</div>
44
+ <div class="anomaly-modal__puzzle-margins">
45
+ <div class="anomaly-modal__puzzle">
46
+
47
+ <div class="anomaly-modal__box" data-index="0">
48
+
49
+
50
+ <label class="" for="image-check_d72c36ab4cb24f49969ecef4b2b88ad1" data-testid="anomaly-modal-tile-0">
51
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_d72c36ab4cb24f49969ecef4b2b88ad1" id="image-check_d72c36ab4cb24f49969ecef4b2b88ad1">
52
+ <img class="anomaly-modal__image" alt=" " id="image-d72c36ab4cb24f49969ecef4b2b88ad1" src="../assets/anomaly/images/challenge/d72c36ab4cb24f49969ecef4b2b88ad1.jpg" data-id="d72c36ab4cb24f49969ecef4b2b88ad1.jpg" data-testid="anomaly-modal-image-0"></img>
53
+ </label>
54
+
55
+ </div>
56
+
57
+ <div class="anomaly-modal__box" data-index="1">
58
+
59
+
60
+ <label class="" for="image-check_a3ec151498ae45aa845c5dda2f14a862" data-testid="anomaly-modal-tile-1">
61
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_a3ec151498ae45aa845c5dda2f14a862" id="image-check_a3ec151498ae45aa845c5dda2f14a862">
62
+ <img class="anomaly-modal__image" alt=" " id="image-a3ec151498ae45aa845c5dda2f14a862" src="../assets/anomaly/images/challenge/a3ec151498ae45aa845c5dda2f14a862.jpg" data-id="a3ec151498ae45aa845c5dda2f14a862.jpg" data-testid="anomaly-modal-image-1"></img>
63
+ </label>
64
+
65
+ </div>
66
+
67
+ <div class="anomaly-modal__box" data-index="2">
68
+
69
+
70
+ <label class="" for="image-check_61b69bac6cc64d5886ca11ee9fc0eef2" data-testid="anomaly-modal-tile-2">
71
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_61b69bac6cc64d5886ca11ee9fc0eef2" id="image-check_61b69bac6cc64d5886ca11ee9fc0eef2">
72
+ <img class="anomaly-modal__image" alt=" " id="image-61b69bac6cc64d5886ca11ee9fc0eef2" src="../assets/anomaly/images/challenge/61b69bac6cc64d5886ca11ee9fc0eef2.jpg" data-id="61b69bac6cc64d5886ca11ee9fc0eef2.jpg" data-testid="anomaly-modal-image-2"></img>
73
+ </label>
74
+
75
+ </div>
76
+
77
+ <div class="anomaly-modal__box" data-index="3">
78
+
79
+
80
+ <label class="" for="image-check_d2cb2f084d1d45b6a98513a79b944bbc" data-testid="anomaly-modal-tile-3">
81
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_d2cb2f084d1d45b6a98513a79b944bbc" id="image-check_d2cb2f084d1d45b6a98513a79b944bbc">
82
+ <img class="anomaly-modal__image" alt=" " id="image-d2cb2f084d1d45b6a98513a79b944bbc" src="../assets/anomaly/images/challenge/d2cb2f084d1d45b6a98513a79b944bbc.jpg" data-id="d2cb2f084d1d45b6a98513a79b944bbc.jpg" data-testid="anomaly-modal-image-3"></img>
83
+ </label>
84
+
85
+ </div>
86
+
87
+ <div class="anomaly-modal__box" data-index="4">
88
+
89
+
90
+ <label class="" for="image-check_5e6714da571b4c8895e1d949f47cedc2" data-testid="anomaly-modal-tile-4">
91
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_5e6714da571b4c8895e1d949f47cedc2" id="image-check_5e6714da571b4c8895e1d949f47cedc2">
92
+ <img class="anomaly-modal__image" alt=" " id="image-5e6714da571b4c8895e1d949f47cedc2" src="../assets/anomaly/images/challenge/5e6714da571b4c8895e1d949f47cedc2.jpg" data-id="5e6714da571b4c8895e1d949f47cedc2.jpg" data-testid="anomaly-modal-image-4"></img>
93
+ </label>
94
+
95
+ </div>
96
+
97
+ <div class="anomaly-modal__box" data-index="5">
98
+
99
+
100
+ <label class="" for="image-check_23c1e3ae8ce8443183e624800acf2f47" data-testid="anomaly-modal-tile-5">
101
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_23c1e3ae8ce8443183e624800acf2f47" id="image-check_23c1e3ae8ce8443183e624800acf2f47">
102
+ <img class="anomaly-modal__image" alt=" " id="image-23c1e3ae8ce8443183e624800acf2f47" src="../assets/anomaly/images/challenge/23c1e3ae8ce8443183e624800acf2f47.jpg" data-id="23c1e3ae8ce8443183e624800acf2f47.jpg" data-testid="anomaly-modal-image-5"></img>
103
+ </label>
104
+
105
+ </div>
106
+
107
+ <div class="anomaly-modal__box" data-index="6">
108
+
109
+
110
+ <label class="" for="image-check_ae7275bb00734944820af39a39d39107" data-testid="anomaly-modal-tile-6">
111
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_ae7275bb00734944820af39a39d39107" id="image-check_ae7275bb00734944820af39a39d39107">
112
+ <img class="anomaly-modal__image" alt=" " id="image-ae7275bb00734944820af39a39d39107" src="../assets/anomaly/images/challenge/ae7275bb00734944820af39a39d39107.jpg" data-id="ae7275bb00734944820af39a39d39107.jpg" data-testid="anomaly-modal-image-6"></img>
113
+ </label>
114
+
115
+ </div>
116
+
117
+ <div class="anomaly-modal__box" data-index="7">
118
+
119
+
120
+ <label class="" for="image-check_d13828125211402eaef44405520c4080" data-testid="anomaly-modal-tile-7">
121
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_d13828125211402eaef44405520c4080" id="image-check_d13828125211402eaef44405520c4080">
122
+ <img class="anomaly-modal__image" alt=" " id="image-d13828125211402eaef44405520c4080" src="../assets/anomaly/images/challenge/d13828125211402eaef44405520c4080.jpg" data-id="d13828125211402eaef44405520c4080.jpg" data-testid="anomaly-modal-image-7"></img>
123
+ </label>
124
+
125
+ </div>
126
+
127
+ <div class="anomaly-modal__box" data-index="8">
128
+
129
+
130
+ <label class="" for="image-check_aa87423535154d4f8f2a1cfff1de7a8b" data-testid="anomaly-modal-tile-8">
131
+ <input type="checkbox" class="anomaly-modal__check" name="image-check_aa87423535154d4f8f2a1cfff1de7a8b" id="image-check_aa87423535154d4f8f2a1cfff1de7a8b">
132
+ <img class="anomaly-modal__image" alt=" " id="image-aa87423535154d4f8f2a1cfff1de7a8b" src="../assets/anomaly/images/challenge/aa87423535154d4f8f2a1cfff1de7a8b.jpg" data-id="aa87423535154d4f8f2a1cfff1de7a8b.jpg" data-testid="anomaly-modal-image-8"></img>
133
+ </label>
134
+
135
+ </div>
136
+
137
+ </div>
138
+ </div>
139
+ <div class="anomaly-modal__controls">
140
+ <button name="challenge-submit" class="btn btn--primary anomaly-modal__submit js-anomaly-modal-submit" form="challenge-form" value="d4cd0dabcf4caa22ad92fab40844c786" >Submit</button>
141
+ <p></p>
142
+ <input class="feedback-toggle" type="checkbox" id="feedback" unchecked />
143
+ <label for="feedback" class="feedback-text">Images not loading?</label>
144
+ <div class="feedback-content">
145
+ <label class="feedback-instructions">Please email the following code to:</label>
146
+ <p class="feedback-instructions">error-lite@duckduckgo.com</p>
147
+ <p><label class="feedback-instructions">Code: d4cd0dabcf4caa22ad92fab40844c786</label></p>
148
+ </div>
149
+ </div>
150
+ </div>
151
+ </form>
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+ <br>
162
+ <p class='html-only'>
163
+ <!-- This is the Lite version. Get the full-JS version <a href="https://duckduckgo.com/">here</a>. //-->
164
+ </p>
165
+ </center>
166
+
167
+ <img src="//duckduckgo.com/t/tqadb?2019127&amp;s=lite&amp;cc=botnet&amp;sc=1&amp;i=-1&amp;p=d72c36ab4cb24f49969ecef4b2b88ad1-a3ec151498ae45aa845c5dda2f14a862-61b69bac6cc64d5886ca11ee9fc0eef2-d2cb2f084d1d45b6a98513a79b944bbc-5e6714da571b4c8895e1d949f47cedc2-23c1e3ae8ce8443183e624800acf2f47-ae7275bb00734944820af39a39d39107-d13828125211402eaef44405520c4080-aa87423535154d4f8f2a1cfff1de7a8b&amp;iadb=0&amp;gk=d4cd0dabcf4caa22ad92fab40844c786&amp;c=dadb&amp;e=0&amp;o=FYGE4bAiNVr8IcPNJsONTSwq441QyPzksuoqxWPyWuuL0OwnOoQUGNKgGm2X8RYI%0A&amp;r=inc"/>
168
+
169
+
170
+
171
+
172
+
173
+
174
+ </body>
175
+
176
+ </html>