oopshnik committed on
Commit
ffce262
·
verified ·
1 Parent(s): d98b77d

Upload 9 files

Browse files
Files changed (9) hide show
  1. .gitignore +10 -0
  2. .python-version +1 -0
  3. Dockerfile +39 -0
  4. README.md +0 -10
  5. main.py +55 -0
  6. pyproject.toml +15 -0
  7. scrape.py +35 -0
  8. search.py +58 -0
  9. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.13
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python image whose interpreter satisfies pyproject.toml's
# requires-python (>=3.13). The previous python:3.11 base made `pip install .`
# fail with a Python-version error. Bookworm is pinned so the apt package
# names below stay stable.
FROM python:3.13-slim-bookworm

# Set work directory
WORKDIR /app

# Minimal build tooling only. Chromium's shared-library dependencies
# (libnss3, libasound2, ...) are installed by `patchright install --with-deps`
# below, which always matches the browser build being downloaded.
# NOTE(review): libgconf-2-4 was removed from Debian years ago and is no
# longer installable; it has been dropped from the manual list.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy project files
COPY . /app

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir . && \
    pip install --no-cache-dir fastapi uvicorn[standard] patchright markdownify beautifulsoup4 readability-lxml ddgs

# Download the Chromium build that patchright drives, plus its OS-level
# dependencies. Without this step the /scrape endpoint fails at runtime
# with "Executable doesn't exist".
RUN patchright install --with-deps chromium

# Expose port
EXPOSE 8000

# Run the FastAPI server
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,10 +0,0 @@
1
- ---
2
- title: Api
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
main.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query
2
+ from scrape import scrape, sanitize, convert
3
+ from search import search, format as format_results, extract_urls, images, videos
4
+ import asyncio
5
+ from fastapi.responses import JSONResponse
6
+
7
+ app = FastAPI()
8
+
9
@app.get("/scrape")
async def scrape_endpoint(url: str = Query('https://example.com')):
    """Fetch *url* with the headless browser, clean the HTML, and return Markdown."""
    raw_html = await scrape(url)
    cleaned = await sanitize(raw_html)
    markdown = await convert(cleaned)
    return JSONResponse(content={"markdown": markdown})
14
+
15
@app.post("/sanitize")
async def sanitize_endpoint(content: str):
    """Run the HTML sanitizer over *content* and return the cleaned markup."""
    return {"sanitized": await sanitize(content)}
19
+
20
@app.post("/convert")
async def convert_endpoint(content: str):
    """Convert the given HTML *content* to Markdown."""
    return {"markdown": await convert(content)}
24
+
25
@app.get("/search")
def search_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
    """Return one page of DuckDuckGo text results for *prompt*."""
    hits = search(prompt, page, region, safesearch, timelimit)
    return {"results": list(hits)}
29
+
30
@app.get("/images")
def images_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
    """Return one page of DuckDuckGo image results for *prompt*."""
    hits = images(prompt, page, region, safesearch, timelimit)
    return {"results": list(hits)}
34
+
35
@app.get("/videos")
def videos_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
    """Return one page of DuckDuckGo video results for *prompt*."""
    hits = videos(prompt, page, region, safesearch, timelimit)
    return {"results": list(hits)}
39
+
40
@app.post("/format")
def format_endpoint(results: list, prompt: str):
    """Render a list of text-search result dicts as a Markdown report."""
    return {"formatted": format_results(results, prompt)}
44
+
45
@app.post("/extract_urls")
def extract_urls_endpoint(results: list):
    """Pull the 'href' link out of each search-result dict."""
    return {"urls": extract_urls(results)}
49
+
50
def run():
    """Launch the API with uvicorn in auto-reload (development) mode."""
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)


if __name__ == "__main__":
    run()
pyproject.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Project metadata and runtime dependencies (PEP 621).
# Installed via `pip install .` (see Dockerfile) or managed with uv
# (uv.lock / .python-version are committed alongside).
[project]
name = "tera"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
# Matches the committed .python-version (3.13).
requires-python = ">=3.13"
dependencies = [
    "beautifulsoup4>=4.14.3",    # HTML parsing/prettifying (scrape.sanitize)
    "patchright>=1.20.0",        # patched Playwright driver for headless Chromium (scrape.scrape)
    "httpx>=0.28.1",
    "markdownify>=1.2.2",        # HTML -> Markdown conversion (scrape.convert)
    "readability-lxml>=0.8.4.1",
    "ddgs>=1.0.0",               # DuckDuckGo search client (search.py)
    # NOTE(review): unpinned, unlike the other entries — consider adding a
    # version floor for reproducible installs.
    "fastapi[standard]",
]
scrape.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from patchright.async_api import async_playwright
3
+ from markdownify import markdownify as md
4
+ from bs4 import BeautifulSoup
5
+ from readability import Document
6
+
7
async def scrape(url):
    """Fetch *url* in headless Chromium and return the rendered page HTML.

    Requests for heavy assets (images, media, fonts, stylesheets, "other")
    are aborted before navigation to speed up the load; document and script
    traffic passes through unchanged.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        # Fixed desktop viewport plus a common Chrome UA string so the page
        # is served a normal desktop variant.
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        # Route must be registered before page.goto so the filter applies to
        # the initial navigation's subresources as well.
        await page.route("**/*", lambda route: route.abort()
            if route.request.resource_type in ["image", "media", "font", "stylesheet", "other"]
            else route.continue_())

        try:
            # domcontentloaded (not full load) is enough since heavy assets
            # are blocked anyway; 30s navigation timeout.
            await page.goto(url, wait_until='domcontentloaded', timeout=30000)
            content = await page.content()
        finally:
            # Closing the browser also tears down the context and page,
            # even when goto/content raised.
            await browser.close()

    return content
29
+
30
async def sanitize(content):
    """Re-serialize *content* as pretty-printed HTML via BeautifulSoup."""
    return BeautifulSoup(content, "html.parser").prettify()
33
+
34
async def convert(content):
    """Turn an HTML string into Markdown text."""
    markdown = md(content)
    return markdown
search.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ddgs import DDGS
2
+
3
+
4
def search(prompt, page, region='us-en', safesearch='off', timelimit='y'):
    """One page of DuckDuckGo text results for *prompt*."""
    client = DDGS()
    return client.text(
        prompt,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        page=page,
        backend="auto",
    )
10
+
11
def images(prompt, page, region='us-en', safesearch='off', timelimit='y'):
    """One page of DuckDuckGo image results for *prompt*."""
    client = DDGS()
    return client.images(
        prompt,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        page=page,
        backend="auto",
    )
17
+
18
def videos(prompt, page, region='us-en', safesearch='off', timelimit='y'):
    """One page of DuckDuckGo video results for *prompt*."""
    client = DDGS()
    return client.videos(
        prompt,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        page=page,
        backend="auto",
    )
24
+
25
def format(results, prompt):
    """Render text-search result dicts as a readable Markdown report.

    Each entry shows rank, title, link, and trimmed snippet, separated by an
    80-dash horizontal rule. (Name shadows the builtin ``format``; kept
    because main.py imports it under this name.)
    """
    lines = [f"Query: {prompt}"]
    for rank, hit in enumerate(results, start=1):
        title = hit['title']
        lines.extend([
            f"### {rank}. {title}",
            f"**Link:** [{title}]({hit['href']})",
            "",
            hit['body'].strip(),
            "-" * 80,
            "",
        ])
    return "\n".join(lines)
38
+
39
def extract_urls(results):
    """Return the 'href' link of every result dict, in order."""
    return [entry['href'] for entry in results]
41
+
42
def main():
    """Demo entry point: run a text search and exercise the formatting helpers.

    Was declared ``async`` but contained no ``await``, so the coroutine and
    ``asyncio.run`` were pure overhead — now a plain function.
    """
    prompt = "chatgpt"
    page = 1

    text_results = search(prompt, page)
    print(text_results)

    formatted_results = format(text_results, prompt)
    print(formatted_results)

    urls = extract_urls(text_results)
    print(urls)


if __name__ == "__main__":
    main()
uv.lock ADDED
The diff for this file is too large to render. See raw diff