File size: 2,305 Bytes
3d2b9c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# app.py
import gradio as gr
from scraper import fetch, extract
from nlp_pipeline import process_document, embed_text
from vector_store import SimpleVectorStore
import numpy as np
import time

# init store (summary embedding dim from model)
# Module-level singleton vector store shared by every Gradio handler below.
# DIM must match the dimensionality of the vectors produced by embed_text()
# in nlp_pipeline (all-MiniLM-L6-v2 emits 384-dim embeddings).
DIM = 384  # all-MiniLM-L6-v2 => 384
store = SimpleVectorStore(dim=DIM)

def crawl_and_index(url):
    """Fetch *url*, run the NLP pipeline over it, and index the result.

    Returns a ``(status_message, metadata)`` tuple matching the Gradio
    outputs: *metadata* is ``None`` on failure, the near-duplicate's stored
    metadata when the document is skipped, or the newly indexed metadata
    dict on success.
    """
    # Guard: the textbox can submit an empty/whitespace-only string.
    if not url or not url.strip():
        return "fetch failed", None
    html, final_url = fetch(url.strip())
    if not html:
        return "fetch failed", None
    doc = extract(html, final_url)
    nlp = process_document(doc)
    # simple dedupe: search against store and check similarity
    qvec = nlp["embedding"]
    if store.index.ntotal > 0:
        hits = store.search(qvec, k=3)
        # NOTE(review): assumes search() returns (score, meta) pairs sorted
        # best-first with a cosine-like similarity — confirm in SimpleVectorStore.
        if hits and hits[0][0] > 0.90:  # very similar (cosine)
            return "duplicate/updated - skipped", hits[0][1]
    # Bug fix: str(doc.get("publish_date")) stored the literal string "None"
    # when no date was extracted; keep a real None (JSON null) instead.
    publish_date = doc.get("publish_date")
    meta = {
        "url": doc["url"],
        "title": doc["title"],
        "summary": nlp["summary"],
        "entities": nlp["entities"],
        "provenance": nlp["provenance"],
        "publish_date": str(publish_date) if publish_date is not None else None,
        "timestamp": time.time()
    }
    store.add(qvec, meta)
    return "indexed", meta

def semantic_search(query, k=5):
    """Embed *query*, search the vector store, and shape hits for display.

    Returns a list of dicts (score, title, summary, url, publish_date),
    one per hit, in the order the store returns them.
    """
    query_vec = embed_text(query)
    return [
        {
            "score": round(similarity, 4),
            "title": meta["title"],
            "summary": meta["summary"],
            "url": meta["url"],
            "publish_date": meta["publish_date"],
        }
        for similarity, meta in store.search(query_vec, k=k)
    ]

# Gradio UI wiring. NOTE: component creation order inside gr.Blocks()
# determines the rendered layout — do not reorder these statements.
with gr.Blocks() as demo:
    gr.Markdown("# NLP Web Scraper (HF Space demo)")
    with gr.Row():
        url_input = gr.Textbox(label="Seed URL", placeholder="https://example.com/article")
        crawl_btn = gr.Button("Crawl & Index")
    status = gr.Label()
    result_box = gr.JSON(label="Indexed document metadata")
    # crawl_and_index returns (status_message, metadata) — mapped positionally
    # onto [status, result_box].
    crawl_btn.click(crawl_and_index, inputs=url_input, outputs=[status, result_box])

    gr.Markdown("## Semantic Search")
    query = gr.Textbox(label="Query")
    k = gr.Slider(1, 10, value=5, step=1, label="Top K")
    search_btn = gr.Button("Search")
    # semantic_search returns a list of dicts; rendered as a table with these headers.
    search_results = gr.Dataframe(headers=["score","title","summary","url","publish_date"], datatype="json")
    search_btn.click(semantic_search, inputs=[query, k], outputs=search_results)

if __name__ == "__main__":
    demo.launch()