Juanoto2012 committed on
Commit
1b3da14
·
verified ·
1 Parent(s): ae5ce8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -42
app.py CHANGED
@@ -4,10 +4,11 @@ import requests
4
  from bs4 import BeautifulSoup
5
  import uvicorn
6
  import urllib.parse
7
- from duckduckgo_search import DDGS # <- Nueva importación para imágenes
8
 
9
- app = FastAPI(title="Glimpse Scraping API", description="API de Scraping directo e Imágenes para buscadores")
10
 
 
11
  app.add_middleware(
12
  CORSMiddleware,
13
  allow_origins=["*"],
@@ -16,13 +17,12 @@ app.add_middleware(
16
  allow_headers=["*"],
17
  )
18
 
19
- # User-Agent de un navegador real para evitar bloqueos en texto
20
  HEADERS = {
21
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
22
  "Accept-Language": "es-ES,es;q=0.9,en;q=0.8"
23
  }
24
 
25
- # --- FUNCIONES DE SCRAPING DE TEXTO (WEB) ---
26
 
27
  def scrape_duckduckgo(query: str):
28
  url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
@@ -34,11 +34,9 @@ def scrape_duckduckgo(query: str):
34
  for result in soup.select('.result__body'):
35
  title_tag = result.select_one('.result__title a')
36
  snippet_tag = result.select_one('.result__snippet')
37
-
38
  if title_tag:
39
  raw_url = title_tag.get('href', '')
40
  clean_url = urllib.parse.unquote(raw_url.replace('//duckduckgo.com/l/?uddg=', '').split('&')[0])
41
-
42
  results.append({
43
  "title": title_tag.text.strip(),
44
  "url": clean_url if clean_url.startswith('http') else raw_url,
@@ -51,12 +49,10 @@ def scrape_mojeek(query: str):
51
  resp = requests.get(url, headers=HEADERS, timeout=10)
52
  resp.raise_for_status()
53
  soup = BeautifulSoup(resp.text, 'lxml')
54
-
55
  results = []
56
  for li in soup.select('ul.results-standard > li'):
57
  a_tag = li.select_one('a.ob')
58
  p_tag = li.select_one('p.s')
59
-
60
  if a_tag:
61
  results.append({
62
  "title": a_tag.text.strip(),
@@ -70,12 +66,10 @@ def scrape_qwant(query: str):
70
  resp = requests.get(url, headers=HEADERS, timeout=10)
71
  resp.raise_for_status()
72
  soup = BeautifulSoup(resp.text, 'lxml')
73
-
74
  results = []
75
  for article in soup.select('article.result'):
76
  title_tag = article.select_one('h2 a')
77
  snippet_tag = article.select_one('.result-snippet')
78
-
79
  if title_tag:
80
  results.append({
81
  "title": title_tag.text.strip(),
@@ -89,13 +83,11 @@ def scrape_brave(query: str):
89
  resp = requests.get(url, headers=HEADERS, timeout=10)
90
  resp.raise_for_status()
91
  soup = BeautifulSoup(resp.text, 'lxml')
92
-
93
  results = []
94
  for snippet in soup.select('.snippet'):
95
  title_tag = snippet.select_one('.heading')
96
  link_tag = snippet.select_one('a')
97
  desc_tag = snippet.select_one('.snippet-content, .snippet-description')
98
-
99
  if title_tag and link_tag:
100
  results.append({
101
  "title": title_tag.text.strip(),
@@ -104,66 +96,67 @@ def scrape_brave(query: str):
104
  })
105
  return results
106
 
107
- # --- ENDPOINTS DE LA API ---
108
 
109
  @app.get("/")
110
  def read_root():
111
- return {"status": "Glimpse API is running! Endpoints: /search, /images"}
112
 
113
  @app.get("/search")
114
  def search(
115
  q: str = Query(..., description="Término de búsqueda"),
116
- engine: str = Query("duckduckgo", description="duckduckgo, mojeek, qwant, brave")
117
  ):
118
  engine = engine.lower()
119
  results = []
120
-
121
  try:
122
- if engine == "duckduckgo":
123
- results = scrape_duckduckgo(q)
124
- elif engine == "mojeek":
125
  results = scrape_mojeek(q)
126
  elif engine in ["qwant", "tapnav"]:
127
  results = scrape_qwant(q)
128
  elif engine == "brave":
129
  results = scrape_brave(q)
130
- elif engine == "startpage":
131
- results = scrape_duckduckgo(q)
132
  else:
133
- results = scrape_duckduckgo(q)
134
-
135
- except requests.exceptions.HTTPError as e:
136
- if e.response.status_code == 403:
137
- raise HTTPException(status_code=403, detail=f"El motor {engine} nos bloqueó.")
138
- raise HTTPException(status_code=500, detail=str(e))
139
  except Exception as e:
140
- raise HTTPException(status_code=500, detail=f"Error al procesar el HTML de {engine}: {str(e)}")
141
-
142
  return {"results": results}
143
 
144
- # --- NUEVO ENDPOINT PARA IMÁGENES ---
145
-
146
  @app.get("/images")
147
  def search_images(
148
  q: str = Query(..., description="Término de búsqueda para imágenes"),
149
- max_results: int = Query(30, description="Cantidad máxima de imágenes a devolver")
150
  ):
151
  try:
152
  results = []
153
  with DDGS() as ddgs:
154
- # duckduckgo-search tiene una función nativa robusta para esto
155
- ddg_images = list(ddgs.images(q, max_results=max_results))
 
156
  for img in ddg_images:
157
- results.append({
158
- "title": img.get("title", ""),
159
- "image_url": img.get("image", ""), # URL de la imagen en alta resolución
160
- "thumbnail_url": img.get("thumbnail", ""), # URL de la miniatura (carga rápida)
161
- "source_url": img.get("url", ""), # URL de la página web donde está la imagen
162
- "source_name": img.get("source", "") # Nombre del sitio web
163
- })
 
 
 
 
 
 
 
 
 
 
 
 
164
  return {"results": results}
165
  except Exception as e:
166
- raise HTTPException(status_code=500, detail=f"Error al buscar imágenes: {str(e)}")
 
167
 
168
  if __name__ == "__main__":
169
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
4
  from bs4 import BeautifulSoup
5
  import uvicorn
6
  import urllib.parse
7
+ from duckduckgo_search import DDGS
8
 
9
+ app = FastAPI(title="Glimpse Scraping API", description="API de Scraping directo e Imágenes")
10
 
11
+ # Configuración CORS para evitar bloqueos del navegador
12
  app.add_middleware(
13
  CORSMiddleware,
14
  allow_origins=["*"],
 
17
  allow_headers=["*"],
18
  )
19
 
 
20
  HEADERS = {
21
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
22
  "Accept-Language": "es-ES,es;q=0.9,en;q=0.8"
23
  }
24
 
25
+ # --- FUNCIONES DE SCRAPING WEB ---
26
 
27
  def scrape_duckduckgo(query: str):
28
  url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
 
34
  for result in soup.select('.result__body'):
35
  title_tag = result.select_one('.result__title a')
36
  snippet_tag = result.select_one('.result__snippet')
 
37
  if title_tag:
38
  raw_url = title_tag.get('href', '')
39
  clean_url = urllib.parse.unquote(raw_url.replace('//duckduckgo.com/l/?uddg=', '').split('&')[0])
 
40
  results.append({
41
  "title": title_tag.text.strip(),
42
  "url": clean_url if clean_url.startswith('http') else raw_url,
 
49
  resp = requests.get(url, headers=HEADERS, timeout=10)
50
  resp.raise_for_status()
51
  soup = BeautifulSoup(resp.text, 'lxml')
 
52
  results = []
53
  for li in soup.select('ul.results-standard > li'):
54
  a_tag = li.select_one('a.ob')
55
  p_tag = li.select_one('p.s')
 
56
  if a_tag:
57
  results.append({
58
  "title": a_tag.text.strip(),
 
66
  resp = requests.get(url, headers=HEADERS, timeout=10)
67
  resp.raise_for_status()
68
  soup = BeautifulSoup(resp.text, 'lxml')
 
69
  results = []
70
  for article in soup.select('article.result'):
71
  title_tag = article.select_one('h2 a')
72
  snippet_tag = article.select_one('.result-snippet')
 
73
  if title_tag:
74
  results.append({
75
  "title": title_tag.text.strip(),
 
83
  resp = requests.get(url, headers=HEADERS, timeout=10)
84
  resp.raise_for_status()
85
  soup = BeautifulSoup(resp.text, 'lxml')
 
86
  results = []
87
  for snippet in soup.select('.snippet'):
88
  title_tag = snippet.select_one('.heading')
89
  link_tag = snippet.select_one('a')
90
  desc_tag = snippet.select_one('.snippet-content, .snippet-description')
 
91
  if title_tag and link_tag:
92
  results.append({
93
  "title": title_tag.text.strip(),
 
96
  })
97
  return results
98
 
99
+ # --- ENDPOINTS ---
100
 
101
  @app.get("/")
102
  def read_root():
103
+ return {"status": "Glimpse API is running!", "endpoints": ["/search", "/images"]}
104
 
105
  @app.get("/search")
106
  def search(
107
  q: str = Query(..., description="Término de búsqueda"),
108
+ engine: str = Query("duckduckgo", description="Motor de búsqueda")
109
  ):
110
  engine = engine.lower()
111
  results = []
 
112
  try:
113
+ if engine == "mojeek":
 
 
114
  results = scrape_mojeek(q)
115
  elif engine in ["qwant", "tapnav"]:
116
  results = scrape_qwant(q)
117
  elif engine == "brave":
118
  results = scrape_brave(q)
 
 
119
  else:
120
+ results = scrape_duckduckgo(q) # Duckduckgo o Startpage (como fallback)
 
 
 
 
 
121
  except Exception as e:
122
+ raise HTTPException(status_code=500, detail=f"Error en {engine}: {str(e)}")
 
123
  return {"results": results}
124
 
 
 
125
  @app.get("/images")
126
  def search_images(
127
  q: str = Query(..., description="Término de búsqueda para imágenes"),
128
+ max_results: int = Query(40, description="Cantidad máxima de imágenes")
129
  ):
130
  try:
131
  results = []
132
  with DDGS() as ddgs:
133
+ # Forzamos los parámetros correctos para la librería
134
+ ddg_images = list(ddgs.images(keywords=q, max_results=max_results))
135
+
136
  for img in ddg_images:
137
+ # Extraemos de forma segura, garantizando compatibilidad con el JS actual
138
+ img_url = img.get("image", "") or img.get("url", "")
139
+ thumb_url = img.get("thumbnail", "") or img_url
140
+ source_link = img.get("url", "")
141
+
142
+ if img_url: # Solo añadimos la imagen si el enlace existe
143
+ results.append({
144
+ # Variables que necesita tu JS actual:
145
+ "title": img.get("title", "Imagen"),
146
+ "image_url": img_url,
147
+ "thumbnail_url": thumb_url,
148
+ "source_url": source_link,
149
+ "source_name": img.get("source", ""),
150
+
151
+ # Variables de compatibilidad clásica por si acaso:
152
+ "url": img_url,
153
+ "content": f"Fuente: {img.get('source', '')}"
154
+ })
155
+
156
  return {"results": results}
157
  except Exception as e:
158
+ print(f"Error interno buscando imágenes: {str(e)}")
159
+ raise HTTPException(status_code=500, detail=f"Fallo al obtener imágenes: {str(e)}")
160
 
161
  if __name__ == "__main__":
162
  uvicorn.run(app, host="0.0.0.0", port=7860)