oopshnik committed on
Commit
ffce262
·
verified ·
1 Parent(s): d98b77d

Upload 9 files

Browse files
Files changed (9) hide show
  1. .gitignore +10 -0
  2. .python-version +1 -0
  3. Dockerfile +39 -0
  4. README.md +0 -10
  5. main.py +55 -0
  6. pyproject.toml +15 -0
  7. scrape.py +35 -0
  8. search.py +58 -0
  9. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.13
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python image whose interpreter satisfies pyproject.toml's
# requires-python (>=3.13). The previous python:3.11 base made `pip install .`
# fail with a Python-version error. Bookworm is pinned so the apt package
# names below stay stable.
FROM python:3.13-slim-bookworm

# Set work directory
WORKDIR /app

# Minimal build tooling only. Chromium's shared-library dependencies
# (libnss3, libasound2, ...) are installed by `patchright install --with-deps`
# below, which always matches the browser build being downloaded.
# NOTE(review): libgconf-2-4 was removed from Debian years ago and is no
# longer installable; it has been dropped from the manual list.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy project files
COPY . /app

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir . && \
    pip install --no-cache-dir fastapi uvicorn[standard] patchright markdownify beautifulsoup4 readability-lxml ddgs

# Download the Chromium build that patchright drives, plus its OS-level
# dependencies. Without this step the /scrape endpoint fails at runtime
# with "Executable doesn't exist".
RUN patchright install --with-deps chromium

# Expose port
EXPOSE 8000

# Run the FastAPI server
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,10 +0,0 @@
1
- ---
2
- title: Api
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
main.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query
2
+ from scrape import scrape, sanitize, convert
3
+ from search import search, format as format_results, extract_urls, images, videos
4
+ import asyncio
5
+ from fastapi.responses import JSONResponse
6
+
7
+ app = FastAPI()
8
+
9
@app.get("/scrape")
async def scrape_endpoint(url: str = Query('https://example.com')):
    """Fetch *url* with the headless browser, clean the HTML, and return Markdown."""
    raw_html = await scrape(url)
    cleaned = await sanitize(raw_html)
    markdown = await convert(cleaned)
    return JSONResponse(content={"markdown": markdown})
14
+
15
@app.post("/sanitize")
async def sanitize_endpoint(content: str):
    """Run the HTML sanitizer over *content* and return the cleaned markup."""
    return {"sanitized": await sanitize(content)}
19
+
20
@app.post("/convert")
async def convert_endpoint(content: str):
    """Convert the given HTML *content* to Markdown."""
    return {"markdown": await convert(content)}
24
+
25
@app.get("/search")
def search_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
    """Return one page of DuckDuckGo text results for *prompt*."""
    hits = search(prompt, page, region, safesearch, timelimit)
    return {"results": list(hits)}
29
+
30
@app.get("/images")
def images_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
    """Return one page of DuckDuckGo image results for *prompt*."""
    hits = images(prompt, page, region, safesearch, timelimit)
    return {"results": list(hits)}
34
+
35
@app.get("/videos")
def videos_endpoint(prompt: str, page: int = 1, region: str = 'us-en', safesearch: str = 'off', timelimit: str = 'y'):
    """Return one page of DuckDuckGo video results for *prompt*."""
    hits = videos(prompt, page, region, safesearch, timelimit)
    return {"results": list(hits)}
39
+
40
@app.post("/format")
def format_endpoint(results: list, prompt: str):
    """Render a list of text-search result dicts as a Markdown report."""
    return {"formatted": format_results(results, prompt)}
44
+
45
@app.post("/extract_urls")
def extract_urls_endpoint(results: list):
    """Pull the 'href' link out of each search-result dict."""
    return {"urls": extract_urls(results)}
49
+
50
def run():
    """Launch the API with uvicorn in auto-reload (development) mode."""
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)


if __name__ == "__main__":
    run()
pyproject.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Project metadata and runtime dependencies (PEP 621).
# Installed via `pip install .` (see Dockerfile) or managed with uv
# (uv.lock / .python-version are committed alongside).
[project]
name = "tera"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
# Matches the committed .python-version (3.13).
requires-python = ">=3.13"
dependencies = [
    "beautifulsoup4>=4.14.3",    # HTML parsing/prettifying (scrape.sanitize)
    "patchright>=1.20.0",        # patched Playwright driver for headless Chromium (scrape.scrape)
    "httpx>=0.28.1",
    "markdownify>=1.2.2",        # HTML -> Markdown conversion (scrape.convert)
    "readability-lxml>=0.8.4.1",
    "ddgs>=1.0.0",               # DuckDuckGo search client (search.py)
    # NOTE(review): unpinned, unlike the other entries — consider adding a
    # version floor for reproducible installs.
    "fastapi[standard]",
]
scrape.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from patchright.async_api import async_playwright
3
+ from markdownify import markdownify as md
4
+ from bs4 import BeautifulSoup
5
+ from readability import Document
6
+
7
async def scrape(url):
    """Fetch *url* in headless Chromium and return the rendered page HTML.

    Requests for heavy assets (images, media, fonts, stylesheets, "other")
    are aborted before navigation to speed up the load; document and script
    traffic passes through unchanged.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        # Fixed desktop viewport plus a common Chrome UA string so the page
        # is served a normal desktop variant.
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        # Route must be registered before page.goto so the filter applies to
        # the initial navigation's subresources as well.
        await page.route("**/*", lambda route: route.abort()
            if route.request.resource_type in ["image", "media", "font", "stylesheet", "other"]
            else route.continue_())

        try:
            # domcontentloaded (not full load) is enough since heavy assets
            # are blocked anyway; 30s navigation timeout.
            await page.goto(url, wait_until='domcontentloaded', timeout=30000)
            content = await page.content()
        finally:
            # Closing the browser also tears down the context and page,
            # even when goto/content raised.
            await browser.close()

    return content
29
+
30
async def sanitize(content):
    """Re-serialize *content* as pretty-printed HTML via BeautifulSoup."""
    return BeautifulSoup(content, "html.parser").prettify()
33
+
34
async def convert(content):
    """Turn an HTML string into Markdown text."""
    markdown = md(content)
    return markdown
search.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ddgs import DDGS
2
+
3
+
4
def search(prompt, page, region='us-en', safesearch='off', timelimit='y'):
    """One page of DuckDuckGo text results for *prompt*."""
    client = DDGS()
    return client.text(
        prompt,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        page=page,
        backend="auto",
    )
10
+
11
def images(prompt, page, region='us-en', safesearch='off', timelimit='y'):
    """One page of DuckDuckGo image results for *prompt*."""
    client = DDGS()
    return client.images(
        prompt,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        page=page,
        backend="auto",
    )
17
+
18
def videos(prompt, page, region='us-en', safesearch='off', timelimit='y'):
    """One page of DuckDuckGo video results for *prompt*."""
    client = DDGS()
    return client.videos(
        prompt,
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        page=page,
        backend="auto",
    )
24
+
25
def format(results, prompt):
    """Render text-search result dicts as a readable Markdown report.

    Each entry shows rank, title, link, and trimmed snippet, separated by an
    80-dash horizontal rule. (Name shadows the builtin ``format``; kept
    because main.py imports it under this name.)
    """
    lines = [f"Query: {prompt}"]
    for rank, hit in enumerate(results, start=1):
        title = hit['title']
        lines.extend([
            f"### {rank}. {title}",
            f"**Link:** [{title}]({hit['href']})",
            "",
            hit['body'].strip(),
            "-" * 80,
            "",
        ])
    return "\n".join(lines)
38
+
39
def extract_urls(results):
    """Return the 'href' link of every result dict, in order."""
    return [entry['href'] for entry in results]
41
+
42
def main():
    """Demo entry point: run a text search and exercise the formatting helpers.

    Was declared ``async`` but contained no ``await``, so the coroutine and
    ``asyncio.run`` were pure overhead — now a plain function.
    """
    prompt = "chatgpt"
    page = 1

    text_results = search(prompt, page)
    print(text_results)

    formatted_results = format(text_results, prompt)
    print(formatted_results)

    urls = extract_urls(text_results)
    print(urls)


if __name__ == "__main__":
    main()
uv.lock ADDED
The diff for this file is too large to render. See raw diff