Juanoto2012 committed on
Commit
8c2240b
verified
1 Parent(s): 2fd10af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -65
app.py CHANGED
@@ -1,93 +1,145 @@
1
  from fastapi import FastAPI, HTTPException, Query
2
  from fastapi.middleware.cors import CORSMiddleware
3
- from duckduckgo_search import DDGS
4
  import requests
 
5
  import uvicorn
 
6
 
7
- app = FastAPI(title="Glimpse Search API", description="API unificada para múltiples motores de búsqueda")
8
 
9
- # Habilitar CORS para que tu app frontend pueda consumir esta API sin errores
10
  app.add_middleware(
11
  CORSMiddleware,
12
- allow_origins=["*"], # En producción, puedes cambiar "*" por el dominio de tu app
13
  allow_credentials=True,
14
  allow_methods=["*"],
15
  allow_headers=["*"],
16
  )
17
 
18
- # Servidores públicos de SearXNG como fallback para los otros motores
19
- SEARXNG_INSTANCES = [
20
- "https://searx.be/search",
21
- "https://searx.fmac.network/search",
22
- "https://search.mdosch.de/search"
23
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  @app.get("/")
26
  def read_root():
27
- return {"status": "Glimpse API is running. Use the /search endpoint."}
28
 
29
  @app.get("/search")
30
  def search(
31
  q: str = Query(..., description="Término de búsqueda"),
32
- engine: str = Query("duckduckgo", description="duckduckgo, brave, startpage, mojeek, qwant")
33
  ):
34
- results = []
35
  engine = engine.lower()
 
36
 
37
- # 1. Búsqueda nativa y rápida con DuckDuckGo
38
- if engine == "duckduckgo":
39
- try:
40
- with DDGS() as ddgs:
41
- # max_results controla cuántos enlaces devuelves
42
- ddg_results = list(ddgs.text(q, max_results=15))
43
- for r in ddg_results:
44
- results.append({
45
- "title": r.get("title", ""),
46
- "url": r.get("href", ""),
47
- "content": r.get("body", "")
48
- })
49
- except Exception as e:
50
- raise HTTPException(status_code=500, detail=f"Error en DuckDuckGo: {str(e)}")
51
-
52
- # 2. Búsqueda con Brave, Startpage, Mojeek a través del puente de SearXNG
53
- else:
54
- # Si enviaste "tapnav", lo mapearemos a algo que Searxng entienda, como qwant
55
- if engine == "tapnav":
56
- engine = "qwant"
57
-
58
- success = False
59
- params = {
60
- "q": q,
61
- "format": "json",
62
- "engines": engine,
63
- "language": "es-ES"
64
- }
65
-
66
- # Intentar en nuestras instancias de fallback si alguna falla
67
- for instance in SEARXNG_INSTANCES:
68
- try:
69
- resp = requests.get(instance, params=params, timeout=5)
70
- resp.raise_for_status()
71
- data = resp.json()
72
-
73
- for r in data.get("results", []):
74
- results.append({
75
- "title": r.get("title", ""),
76
- "url": r.get("url", ""),
77
- "content": r.get("content", "")
78
- })
79
- success = True
80
- break # Si tuvo éxito, salimos del bucle
81
- except Exception as e:
82
- print(f"Falló la instancia {instance} para el motor {engine}: {e}")
83
- continue
84
-
85
- if not success:
86
- raise HTTPException(status_code=503, detail=f"Todos los servidores proxy para el motor {engine} fallaron temporalmente.")
87
 
88
- # Devolvemos exactamente el formato que espera tu HTML
89
  return {"results": results}
90
 
91
  if __name__ == "__main__":
92
- # Hugging Face expone los puertos en el 7860 por defecto
93
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  from fastapi import FastAPI, HTTPException, Query
2
  from fastapi.middleware.cors import CORSMiddleware
 
3
  import requests
4
+ from bs4 import BeautifulSoup
5
  import uvicorn
6
+ import urllib.parse
7
 
8
# FastAPI application object for the scraping service.
app = FastAPI(
    title="Glimpse Scraping API",
    description="API de Scraping directo para buscadores",
)

# Any origin may call this API. Credentials are disabled because a wildcard
# origin must not be combined with credentialed requests; if cookies or
# Authorization headers are ever needed, list the allowed origins explicitly
# and re-enable allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
17
 
18
# Request headers shared by every scraper: a real desktop-browser User-Agent
# (to avoid being blocked) and a Spanish-first language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
}
23
+
24
+ def scrape_duckduckgo(query: str):
25
+ # Usamos la versi贸n HTML plana de DDG que es m谩s f谩cil de scrapear
26
+ url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
27
+ resp = requests.get(url, headers=HEADERS, timeout=10)
28
+ resp.raise_for_status()
29
+ soup = BeautifulSoup(resp.text, 'lxml')
30
+
31
+ results = []
32
+ for result in soup.select('.result__body'):
33
+ title_tag = result.select_one('.result__title a')
34
+ snippet_tag = result.select_one('.result__snippet')
35
+
36
+ if title_tag:
37
+ # Limpiar la URL de redirecci贸n de DDG
38
+ raw_url = title_tag.get('href', '')
39
+ clean_url = urllib.parse.unquote(raw_url.replace('//duckduckgo.com/l/?uddg=', '').split('&')[0])
40
+
41
+ results.append({
42
+ "title": title_tag.text.strip(),
43
+ "url": clean_url if clean_url.startswith('http') else raw_url,
44
+ "content": snippet_tag.text.strip() if snippet_tag else ""
45
+ })
46
+ return results
47
+
48
def scrape_mojeek(query: str):
    """Scrape Mojeek's HTML results page for *query*.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts, one per
        ``ul.results-standard > li`` item that contains an ``a.ob`` link.
        ``content`` is ``""`` when the item has no ``p.s`` snippet.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx response (via ``raise_for_status``).
    """
    resp = requests.get(
        "https://www.mojeek.com/search",
        params={"q": query, "fmt": "html"},  # requests handles the URL encoding
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for item in soup.select("ul.results-standard > li"):
        link_tag = item.select_one("a.ob")
        if link_tag is None:
            continue
        snippet_tag = item.select_one("p.s")
        results.append({
            "title": link_tag.text.strip(),
            "url": link_tag.get("href", ""),
            "content": snippet_tag.text.strip() if snippet_tag else "",
        })
    return results
66
+
67
def scrape_qwant(query: str):
    """Scrape Qwant Lite for *query*.

    Qwant Lite is used because it does not require JavaScript rendering.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts, one per
        ``article.result`` element that contains an ``h2 a`` link.
        ``content`` is ``""`` when no ``.result-snippet`` is present.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx response (via ``raise_for_status``).
    """
    resp = requests.get(
        "https://lite.qwant.com/",
        params={"q": query},  # requests handles the URL encoding
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for article in soup.select("article.result"):
        title_tag = article.select_one("h2 a")
        if title_tag is None:
            continue
        snippet_tag = article.select_one(".result-snippet")
        results.append({
            "title": title_tag.text.strip(),
            "url": title_tag.get("href", ""),
            "content": snippet_tag.text.strip() if snippet_tag else "",
        })
    return results
86
+
87
def scrape_brave(query: str):
    """Scrape Brave Search's results page for *query*.

    Returns:
        A list of ``{"title", "url", "content"}`` dicts, one per ``.snippet``
        element that has both a ``.heading`` element and an ``<a>`` link.
        ``content`` comes from ``.snippet-content`` or
        ``.snippet-description`` and is ``""`` when neither is present.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx response (via ``raise_for_status``).
    """
    resp = requests.get(
        "https://search.brave.com/search",
        params={"q": query},  # requests handles the URL encoding
        headers=HEADERS,
        timeout=10,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    results = []
    for snippet in soup.select(".snippet"):
        title_tag = snippet.select_one(".heading")
        link_tag = snippet.select_one("a")
        if title_tag is None or link_tag is None:
            continue
        desc_tag = snippet.select_one(".snippet-content, .snippet-description")
        results.append({
            "title": title_tag.text.strip(),
            "url": link_tag.get("href", ""),
            "content": desc_tag.text.strip() if desc_tag else "",
        })
    return results
106
 
107
@app.get("/")
def read_root():
    """Return a static status payload for the root path."""
    return {"status": "Glimpse Scraping API is running!"}
110
 
111
@app.get("/search")
def search(
    q: str = Query(..., description="Término de búsqueda"),
    engine: str = Query("duckduckgo", description="duckduckgo, mojeek, qwant, brave"),
):
    """Run *q* against the requested search engine and return its results.

    Args:
        q: Search term (required).
        engine: Engine name, case-insensitive. ``tapnav`` is served by the
            Qwant scraper. ``startpage`` and any unrecognised name fall back
            to DuckDuckGo.

    Returns:
        ``{"results": [...]}`` where each item has ``title``, ``url`` and
        ``content`` keys.

    Raises:
        HTTPException: 403 when the upstream engine answers 403; 500 for any
            other upstream, network or parsing failure.
    """
    engine = engine.strip().lower()

    scrapers = {
        "duckduckgo": scrape_duckduckgo,
        "mojeek": scrape_mojeek,
        "qwant": scrape_qwant,
        "tapnav": scrape_qwant,
        "brave": scrape_brave,
    }
    # Startpage aggressively blocks raw scraping (it uses tokens), so it and
    # any unknown engine are silently redirected to DuckDuckGo.
    scraper = scrapers.get(engine, scrape_duckduckgo)

    try:
        results = scraper(q)
    except requests.exceptions.HTTPError as e:
        # Response is falsy for 4xx/5xx, so compare against None explicitly.
        upstream_status = e.response.status_code if e.response is not None else None
        if upstream_status == 403:
            raise HTTPException(
                status_code=403,
                detail=f"El motor {engine} nos bloqueó (Protección Anti-Bot/CORS).",
            ) from e
        raise HTTPException(status_code=500, detail=str(e)) from e
    except requests.exceptions.RequestException as e:
        # Timeouts / connection failures are not HTML-parsing errors; report
        # them as what they are.
        raise HTTPException(
            status_code=500,
            detail=f"Error de red al consultar {engine}: {str(e)}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error al procesar el HTML de {engine}: {str(e)}",
        ) from e

    return {"results": results}
143
 
144
if __name__ == "__main__":
    # Serve on all interfaces, port 7860 (per the previous revision's note,
    # the port Hugging Face exposes by default).
    uvicorn.run(app, host="0.0.0.0", port=7860)