Upload 9 files
- .gitignore +10 -0
- .python-version +1 -0
- Dockerfile +39 -0
- README.md +0 -10
- main.py +55 -0
- pyproject.toml +15 -0
- scrape.py +35 -0
- search.py +58 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
.python-version
ADDED
@@ -0,0 +1 @@
+3.13
Dockerfile
ADDED
@@ -0,0 +1,39 @@
+# Use official Python image (3.13, matching requires-python in pyproject.toml)
+FROM python:3.13-slim
+
+# Set work directory
+WORKDIR /app
+
+# Install system dependencies needed by headless Chromium
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libglib2.0-0 \
+    libnss3 \
+    libxkbcommon0 \
+    libfontconfig1 \
+    libxss1 \
+    libasound2 \
+    libatk1.0-0 \
+    libatk-bridge2.0-0 \
+    libcups2 \
+    libdrm2 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxrandr2 \
+    libgbm1 \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy project files
+COPY . /app
+
+# Install Python dependencies, then download the Chromium build patchright drives
+RUN pip install --upgrade pip && \
+    pip install --no-cache-dir . && \
+    pip install --no-cache-dir fastapi "uvicorn[standard]" patchright markdownify beautifulsoup4 readability-lxml ddgs && patchright install chromium
+
+# Expose port
+EXPOSE 8000
+
+# Run the FastAPI server
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
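
A quick way to exercise the image: build and run it (for example `docker build -t tera .` followed by `docker run -p 8000:8000 tera`; the `tera` tag is just an illustrative name), then hit the scrape endpoint from the host. A minimal sketch using httpx, which is already a project dependency:

import httpx

# Assumes the container (or a local `python main.py`) is serving on port 8000.
resp = httpx.get(
    "http://localhost:8000/scrape",
    params={"url": "https://example.com"},
    timeout=60.0,  # the first request pays the headless-Chromium startup cost
)
resp.raise_for_status()
print(resp.json()["markdown"])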
README.md
CHANGED
@@ -1,10 +0,0 @@
----
-title: Api
-emoji: 👁
-colorFrom: red
-colorTo: purple
-sdk: docker
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py
ADDED
@@ -0,0 +1,55 @@
+from fastapi import FastAPI, Query
+from scrape import scrape, sanitize, convert
+from search import search, format as format_results, extract_urls, images, videos
+import asyncio
+from fastapi.responses import JSONResponse
+
+app = FastAPI()
+
+@app.get("/scrape")
+async def scrape_endpoint(url: str = Query('https://example.com')):
+    html = await sanitize(await scrape(url))
+    md = await convert(html)
+    return JSONResponse(content={"markdown": md})
+
+@app.post("/sanitize")
+async def sanitize_endpoint(content: str):
+    sanitized = await sanitize(content)
+    return {"sanitized": sanitized}
+
+@app.post("/convert")
+async def convert_endpoint(content: str):
+    markdown = await convert(content)
+    return {"markdown": markdown}
+
+@app.get("/search")
+def search_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
+    results = search(prompt, page, region, safesearch, timelimit)
+    return {"results": list(results)}
+
+@app.get("/images")
+def images_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
+    results = images(prompt, page, region, safesearch, timelimit)
+    return {"results": list(results)}
+
+@app.get("/videos")
+def videos_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
+    results = videos(prompt, page, region, safesearch, timelimit)
+    return {"results": list(results)}
+
+@app.post("/format")
+def format_endpoint(results: list, prompt: str):
+    formatted = format_results(results, prompt)
+    return {"formatted": formatted}
+
+@app.post("/extract_urls")
+def extract_urls_endpoint(results: list):
+    urls = extract_urls(results)
+    return {"urls": urls}
+
+def run():
+    import uvicorn
+    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
+
+if __name__ == "__main__":
+    run()
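
One subtlety of the POST routes: FastAPI treats a bare `list` parameter as the request body but a bare `str` as a query parameter, so `/format` expects the result list as JSON and `prompt` on the query string. A sketch of chaining `/search` into `/format`, assuming the server is running locally:

import httpx

base = "http://localhost:8000"
results = httpx.get(base + "/search", params={"prompt": "python asyncio"}).json()["results"]

# The result list travels as the JSON body; prompt rides the query string.
formatted = httpx.post(
    base + "/format",
    params={"prompt": "python asyncio"},
    json=results,
).json()["formatted"]
print(formatted)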
pyproject.toml
ADDED
@@ -0,0 +1,15 @@
+[project]
+name = "tera"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "beautifulsoup4>=4.14.3",
+    "patchright>=1.20.0",
+    "httpx>=0.28.1",
+    "markdownify>=1.2.2",
+    "readability-lxml>=0.8.4.1",
+    "ddgs>=1.0.0",
+    "fastapi[standard]",
+]
scrape.py
ADDED
@@ -0,0 +1,35 @@
+import asyncio
+from patchright.async_api import async_playwright
+from markdownify import markdownify as md
+from bs4 import BeautifulSoup
+from readability import Document
+
+async def scrape(url):
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+
+        context = await browser.new_context(
+            viewport={'width': 1920, 'height': 1080},
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
+        )
+
+        page = await context.new_page()
+
+        await page.route("**/*", lambda route: route.abort()
+                         if route.request.resource_type in ["image", "media", "font", "stylesheet", "other"]
+                         else route.continue_())
+
+        try:
+            await page.goto(url, wait_until='domcontentloaded', timeout=30000)
+            content = await page.content()
+        finally:
+            await browser.close()
+
+    return content
+
+async def sanitize(content):
+    tree = BeautifulSoup(content, "html.parser")
+    return tree.prettify()
+
+async def convert(content):
+    return md(content)
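
The module can also be driven directly, without the HTTP layer. A minimal sketch, assuming patchright's Chromium build has already been downloaded (`patchright install chromium`):

import asyncio
from scrape import scrape, sanitize, convert

async def demo():
    # Fetch the page, tidy the HTML, then convert it to markdown.
    html = await scrape("https://example.com")
    markdown = await convert(await sanitize(html))
    print(markdown[:500])

asyncio.run(demo())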
search.py
ADDED
@@ -0,0 +1,58 @@
+from ddgs import DDGS
+
+
+def search(prompt, page, region='us-en', safesearch='off', timelimit='y'):
+    results = DDGS().text(
+        prompt, region=region,
+        safesearch=safesearch, timelimit=timelimit, page=page, backend="auto"
+    )
+    return results
+
+def images(prompt, page, region='us-en', safesearch='off', timelimit='y'):
+    results = DDGS().images(
+        prompt, region=region,
+        safesearch=safesearch, timelimit=timelimit, page=page, backend="auto"
+    )
+    return results
+
+def videos(prompt, page, region='us-en', safesearch='off', timelimit='y'):
+    results = DDGS().videos(
+        prompt, region=region,
+        safesearch=safesearch, timelimit=timelimit, page=page, backend="auto"
+    )
+    return results
+
+def format(results, prompt):
+    formatted = []
+
+    formatted.append(f"Query: {prompt}")
+    for i, result in enumerate(results, 1):
+        formatted.append(f"### {i}. {result['title']}")
+        formatted.append(f"**Link:** [{result['title']}]({result['href']})")
+        formatted.append("")
+        formatted.append(result['body'].strip())
+        formatted.append("-" * 80)
+        formatted.append("")
+
+    return "\n".join(formatted)
+
+def extract_urls(results):
+    return [result['href'] for result in results]
+
+async def main():
+    prompt = "chatgpt"
+    page = 1
+
+    text_results = search(prompt, page)
+    print(text_results)
+
+    formatted_results = format(text_results, prompt)
+    print(formatted_results)
+
+    urls = extract_urls(text_results)
+    print(urls)
+
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
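
Note that `format` and `extract_urls` assume the key layout of text results (`title`, `href`, `body`); image and video results use different keys, so feeding them through `/format` or `/extract_urls` will raise `KeyError`. A defensive variant (a hypothetical helper; the fallback key names are an assumption about the ddgs image/video result shapes):

def extract_urls_tolerant(results):
    # Try the text-result key first, then plausible image/video keys.
    return [r.get('href') or r.get('url') or r.get('content') for r in results]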
uv.lock
ADDED
The diff for this file is too large to render.