Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from duckduckgo_search import DDGS | |
| from sentence_transformers import SentenceTransformer | |
| from transformers import pipeline | |
| import numpy as np | |
| import re | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # ------------------------- | |
| # Load Models (HF Safe) | |
| # ------------------------- | |
# Checkpoint identifiers, named once so the load calls below stay readable.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
GENERATION_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# Sentence encoder; its `encode` output is what the vector store keeps.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Text-generation pipeline; each call may produce up to 512 new tokens.
llm = pipeline(
    task="text-generation",
    model=GENERATION_MODEL_NAME,
    max_new_tokens=512,
)
| # ------------------------- | |
| # Simple In-Memory Vector Store | |
| # ------------------------- | |
# Parallel lists: embeddings[i] is the vector computed for documents[i].
documents = []
embeddings = []


def add_to_rag(text):
    """Embed *text* and append it to the in-memory vector store.

    Args:
        text: Document text to index.

    Returns:
        None. ``documents`` and ``embeddings`` are extended in place.

    Blank input (``None``, empty, or whitespace-only) is ignored so that
    empty search or scrape results do not occupy a slot in the store.
    """
    if not text or not text.strip():
        return
    # Encode first: if encoding raises, neither list has been touched, so
    # the two lists can never end up with different lengths.
    vector = embedder.encode(text)
    documents.append(text)
    embeddings.append(vector)
def retrieve_from_rag(query, top_k=3):
    """Return the stored documents most similar to *query*.

    Args:
        query: Text to embed and compare against the stored embeddings.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` document strings, ordered from highest
        to lowest cosine similarity. Empty when the store is empty or
        ``top_k`` is not positive.
    """
    # A non-positive top_k must yield nothing: the slice ``[-0:]`` would
    # select every document and a negative value would slice the wrong end.
    if top_k <= 0 or not embeddings:
        return []
    query_emb = embedder.encode(query)
    sims = cosine_similarity([query_emb], embeddings)[0]
    # argsort is ascending; reverse it so the best match comes first.
    ranked = np.argsort(sims)[::-1]
    return [documents[i] for i in ranked[:top_k]]
| # ------------------------- | |
| # URL Scraper | |
| # ------------------------- | |
def scrape_url(url):
    """Fetch a web page, extract its visible text, and add it to the RAG store.

    Args:
        url: Address of the page to fetch. Must start with ``http://`` or
            ``https://``; surrounding whitespace is ignored.

    Returns:
        ``"Scraped and added to RAG."`` when text was fetched and indexed,
        ``"Scrape failed."`` for an unusable URL, a network/HTTP error, a
        page with no text, or an indexing failure.
    """
    url = (url or "").strip()
    if not url.lower().startswith(("http://", "https://")):
        return "Scrape failed."
    # NOTE(review): the URL is user-supplied and fetched from the server's
    # network position; restrict target hosts if this runs where internal
    # addresses are reachable.
    try:
        r = requests.get(url, timeout=10)
        # Without this, 4xx/5xx error pages would be indexed as content.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Script and style bodies are not visible text; drop them.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # Collapse the whitespace runs left behind by the markup.
        text = " ".join(soup.get_text(separator=" ").split())
        if not text:
            return "Scrape failed."
        add_to_rag(text)
    except Exception:
        # Best-effort UI action: report failure instead of raising, but no
        # longer swallow KeyboardInterrupt/SystemExit as a bare except did.
        return "Scrape failed."
    return "Scraped and added to RAG."
| # ------------------------- | |
| # DuckDuckGo Search | |
| # ------------------------- | |
def ddg_search(query):
    """Search DuckDuckGo and index the combined result snippets for RAG.

    Args:
        query: Search string passed to ``DDGS.text``.

    Returns:
        The ``body`` snippets of up to five results joined by newlines, or
        an empty string when the search produced no snippets.

    Errors raised by the search client are not caught here and propagate
    to the caller.
    """
    with DDGS() as ddgs:
        hits = ddgs.text(query, max_results=5) or []
        # Consume the results inside the ``with`` block and skip hits that
        # have no (or an empty) "body" instead of raising KeyError.
        snippets = [hit["body"] for hit in hits if hit.get("body")]
    combined = "\n".join(snippets)
    # Do not index an empty string when nothing came back.
    if combined:
        add_to_rag(combined)
    return combined
| # ------------------------- | |
| # Hybrid Entity Extraction | |
| # ------------------------- | |
def _trim_url(url):
    """Strip sentence punctuation that the URL pattern captured at the end."""
    while url:
        last = url[-1]
        if last in ".,;:!?'\"":
            url = url[:-1]
        elif last == ")" and url.count("(") < url.count(")"):
            # Only drop a closing parenthesis that has no opening partner,
            # so URLs such as ".../Foo_(bar)" keep theirs.
            url = url[:-1]
        else:
            break
    return url


def _is_ipv4(candidate):
    """Return True when every dotted field of *candidate* is within 0-255."""
    return all(int(part) <= 255 for part in candidate.split("."))


def regex_entities(text):
    """Extract pattern-based entities from *text*.

    Args:
        text: Free text to scan; ``None`` is treated as empty.

    Returns:
        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
        ``"ips"``, each mapping to the list of matches in order of
        appearance (duplicates kept). URLs have trailing sentence
        punctuation removed, and dotted quads with a field above 255 are
        not reported as IPs.
    """
    text = text or ""
    raw_urls = re.findall(r"https?://\S+", text)
    raw_ips = re.findall(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", text)
    entities = {
        "emails": re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text),
        "phones": re.findall(r"\+?\d[\d -]{8,}\d", text),
        "urls": [_trim_url(url) for url in raw_urls],
        "ips": [ip for ip in raw_ips if _is_ipv4(ip)],
    }
    return entities
def llm_refine_entities(text):
    """Ask the language model to extract OSINT entities from *text*.

    Args:
        text: Text to analyse; it is inserted verbatim into the prompt.

    Returns:
        The model's generated text only. The prompt asks for JSON with
        people, organizations, locations, vehicles and usernames, but the
        output is returned as-is and is not parsed or validated here.
    """
    # Line layout follows the visible source; any leading indentation the
    # original literal may have carried is not recoverable from this view.
    prompt = (
        "\n"
        "Extract structured OSINT entities from this text.\n"
        "Return JSON with:\n"
        "people, organizations, locations, vehicles, usernames.\n"
        "TEXT:\n"
        f"{text}\n"
    )
    # By default the pipeline returns prompt + completion; ask for the
    # completion alone so the instructions and input are not echoed back.
    result = llm(prompt, return_full_text=False)
    return result[0]["generated_text"]
def hybrid_extract(text):
    """Run the regex and LLM extractors on *text* and format one report.

    Args:
        text: Text to analyse.

    Returns:
        A string with a "Regex Extracted:" section followed by an
        "LLM Refined:" section, separated by a blank line.
    """
    regex_hits = regex_entities(text)
    # Only the first 2000 characters are passed to the language model.
    llm_report = llm_refine_entities(text[:2000])
    sections = (
        f"Regex Extracted:\n{regex_hits}",
        f"LLM Refined:\n{llm_report}",
    )
    return "\n\n".join(sections)
| # ------------------------- | |
| # Chat Logic | |
| # ------------------------- | |
def chat(query, use_web, use_rag):
    """Answer *query* with the language model, optionally adding context.

    Args:
        query: The user's question.
        use_web: When true, run a DuckDuckGo search and use its snippets.
        use_rag: When true, add the most similar stored documents.

    Returns:
        The model's generated answer (without the prompt), or a short hint
        when *query* is blank.
    """
    if not query or not query.strip():
        return "Please enter a question."

    context_parts = []
    if use_web:
        web_text = ddg_search(query)
        if web_text:
            context_parts.append(web_text)
    if use_rag:
        for doc in retrieve_from_rag(query):
            # ddg_search indexes its own result, so the same text can come
            # straight back from retrieval; keep each piece only once.
            if doc and doc not in context_parts:
                context_parts.append(doc)
    # Separate the pieces explicitly; plain concatenation ran the web
    # snippets and the first retrieved document together.
    context = "\n".join(context_parts)

    # Line layout follows the visible source; any leading indentation the
    # original literal may have carried is not recoverable from this view.
    final_prompt = (
        "\n"
        "Use the following context to answer intelligently:\n"
        f"{context}\n"
        f"Question: {query}\n"
    )
    # return_full_text=False keeps the prompt (context and question) out
    # of the text shown to the user.
    response = llm(final_prompt, return_full_text=False)[0]["generated_text"]
    return response
| # ------------------------- | |
| # Gradio UI | |
| # ------------------------- | |
# Gradio UI: a single Blocks app with three sections -- question answering,
# URL scraping into the RAG store, and entity extraction.
# NOTE(review): indentation is not preserved in this view of the file, so
# which statements sit inside `with gr.Blocks()` / `with gr.Row()` cannot be
# confirmed from here -- check against the original layout before editing.
| with gr.Blocks() as demo: | |
| gr.Markdown("# 🔎 Hybrid OSINT AI Assistant") | |
| with gr.Row(): | |
| query = gr.Textbox(label="Ask Question") | |
| use_web = gr.Checkbox(label="Use DuckDuckGo Search") | |
| use_rag = gr.Checkbox(label="Use RAG") | |
| chat_btn = gr.Button("Run") | |
| output = gr.Textbox(label="Response") | |
# "Run" calls chat(query, use_web, use_rag) and shows the result in `output`.
| chat_btn.click(chat, inputs=[query, use_web, use_rag], outputs=output) | |
| gr.Markdown("## 🌐 URL → RAG") | |
| url_input = gr.Textbox(label="Enter URL") | |
| scrape_btn = gr.Button("Scrape") | |
| scrape_output = gr.Textbox() | |
# "Scrape" calls scrape_url(url_input) and shows its status string.
| scrape_btn.click(scrape_url, inputs=url_input, outputs=scrape_output) | |
| gr.Markdown("## 🧩 OSINT Entity Extraction") | |
| extract_input = gr.Textbox(label="Paste Text") | |
| extract_btn = gr.Button("Extract Entities") | |
| extract_output = gr.Textbox() | |
# "Extract Entities" calls hybrid_extract(extract_input) and shows its report.
| extract_btn.click(hybrid_extract, inputs=extract_input, outputs=extract_output) | |
# Start the app.
| demo.launch() |