kuldeep0204 committed on
Commit
3d2b9c6
·
verified ·
1 Parent(s): 66c1ee1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ from scraper import fetch, extract
4
+ from nlp_pipeline import process_document, embed_text
5
+ from vector_store import SimpleVectorStore
6
+ import numpy as np
7
+ import time
8
+
9
# Width of the vectors kept in the store: all-MiniLM-L6-v2 => 384.
DIM = 384

# Module-level vector store, created once with the embedding width above.
store = SimpleVectorStore(dim=DIM)
12
+
13
def crawl_and_index(url, *, dup_threshold=0.90):
    """Fetch a page, run it through the NLP pipeline and add it to the store.

    Args:
        url: Address of the page to crawl. Surrounding whitespace is ignored.
        dup_threshold: Similarity score above which the best existing match
            is treated as the same document, so the page is not indexed
            again. Defaults to 0.90, the value that was previously hard-coded.

    Returns:
        A ``(status, payload)`` pair:

        * ``("fetch failed", None)`` when no URL was given or ``fetch``
          returned no HTML;
        * ``("duplicate/updated - skipped", meta)`` with the metadata of the
          closest stored document when its score exceeds ``dup_threshold``;
        * ``("indexed", meta)`` with the newly stored metadata otherwise.
    """
    # A blank or missing URL (e.g. an empty textbox) cannot be fetched, so
    # report it with the existing failure status instead of calling fetch().
    url = (url or "").strip()
    if not url:
        return "fetch failed", None

    html, final_url = fetch(url)
    if not html:
        return "fetch failed", None

    doc = extract(html, final_url)
    nlp = process_document(doc)
    qvec = nlp["embedding"]

    # Simple dedupe: compare against what is already stored. The lookup is
    # skipped while the index is still empty.
    if store.index.ntotal > 0:
        hits = store.search(qvec, k=3)
        if hits and hits[0][0] > dup_threshold:
            return "duplicate/updated - skipped", hits[0][1]

    # Keep an absent publish date as None rather than the literal string "None".
    publish_date = doc.get("publish_date")
    meta = {
        "url": doc["url"],
        "title": doc["title"],
        "summary": nlp["summary"],
        "entities": nlp["entities"],
        "provenance": nlp["provenance"],
        "publish_date": None if publish_date is None else str(publish_date),
        "timestamp": time.time(),
    }
    store.add(qvec, meta)
    return "indexed", meta
36
+
37
def semantic_search(query, k=5):
    """Return the stored documents most similar to a free-text query.

    Args:
        query: Text to embed and look up. Blank or missing input yields no
            results.
        k: Maximum number of hits to return. Coerced to an int of at least 1,
            since a UI slider may deliver the value as a float.

    Returns:
        A list of dicts in the order produced by ``store.search``, each with
        the keys ``score`` (rounded to 4 decimals), ``title``, ``summary``,
        ``url`` and ``publish_date``. The list is empty when the query is
        blank or nothing has been indexed yet.
    """
    # Nothing meaningful to embed for a blank query.
    if not query or not query.strip():
        return []

    k = max(1, int(k))

    # Same guard crawl_and_index applies before searching: do not query an
    # index that holds no vectors.
    if store.index.ntotal == 0:
        return []

    qvec = embed_text(query)
    return [
        {
            # float() guards against the score being a NumPy scalar, which
            # would not be JSON-serialisable (assumption: store.search is not
            # visible here; confirm its score type).
            "score": round(float(score), 4),
            "title": meta["title"],
            "summary": meta["summary"],
            "url": meta["url"],
            "publish_date": meta["publish_date"],
        }
        for score, meta in store.search(qvec, k=k)
    ]
50
+
51
# Column order of the search-results table; also the keys read from each hit.
_SEARCH_HEADERS = ["score", "title", "summary", "url", "publish_date"]


def _search_rows(query, k):
    """Adapt ``semantic_search`` output to row lists for the results table."""
    # The slider may report its value as a float; semantic_search wants a count.
    hits = semantic_search(query, k=int(k))
    # One row per hit, cells in header order, so every Gradio version lays
    # the values out under the declared columns.
    return [[hit[column] for column in _SEARCH_HEADERS] for hit in hits]


# Gradio UI: a crawl/index panel followed by a semantic-search panel.
with gr.Blocks() as demo:
    gr.Markdown("# NLP Web Scraper (HF Space demo)")

    # NOTE(review): the original indentation was lost; the row is assumed to
    # hold only the URL box and the button — confirm against the real file.
    with gr.Row():
        url_input = gr.Textbox(label="Seed URL", placeholder="https://example.com/article")
        crawl_btn = gr.Button("Crawl & Index")
    status = gr.Label()
    result_box = gr.JSON(label="Indexed document metadata")
    # crawl_and_index returns (status text, metadata), matching the two outputs.
    crawl_btn.click(crawl_and_index, inputs=url_input, outputs=[status, result_box])

    gr.Markdown("## Semantic Search")
    query = gr.Textbox(label="Query")
    k = gr.Slider(1, 10, value=5, step=1, label="Top K")
    search_btn = gr.Button("Search")
    # "json" is not a Dataframe column datatype; declare one type per column.
    search_results = gr.Dataframe(
        headers=_SEARCH_HEADERS,
        datatype=["number", "str", "str", "str", "str"],
    )
    search_btn.click(_search_rows, inputs=[query, k], outputs=search_results)
66
+
67
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()