"""Hybrid OSINT assistant: model setup, in-memory RAG store and URL scraper."""

import logging
import re

import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

logger = logging.getLogger(__name__)

# -------------------------
# Load Models (HF Safe)
# -------------------------
# Sentence embedder used for both indexing documents and encoding queries.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Text-generation pipeline shared by the chat and entity-refinement paths.
llm = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    max_new_tokens=512,
)

# -------------------------
# Simple In-Memory Vector Store
# -------------------------
# Parallel lists: embeddings[i] is the vector for documents[i].
documents = []
embeddings = []


def add_to_rag(text):
    """Embed *text* and append it to the in-memory store.

    Blank or whitespace-only text is ignored: it carries no retrievable
    content and would only add noise to similarity search.

    Args:
        text: Raw document text to index.
    """
    if not text or not text.strip():
        return
    # Encode first so a failed embedding cannot leave the two lists with
    # different lengths.
    vector = embedder.encode(text)
    documents.append(text)
    embeddings.append(vector)


def retrieve_from_rag(query, top_k=3):
    """Return up to *top_k* stored documents most similar to *query*.

    Args:
        query: Search text; encoded with the same embedder as the documents.
        top_k: Maximum number of documents to return. Values <= 0 yield an
            empty list.

    Returns:
        Documents ordered from most to least similar (cosine similarity).
        Empty when the store is empty.
    """
    # Guard top_k <= 0 explicitly: a slice of [-0:] would select everything.
    if not embeddings or top_k <= 0:
        return []

    query_emb = embedder.encode(query)
    sims = cosine_similarity([query_emb], embeddings)[0]
    best_first = np.argsort(sims)[::-1][:top_k]
    return [documents[i] for i in best_first]


# -------------------------
# URL Scraper
# -------------------------
def scrape_url(url):
    """Fetch *url*, extract its visible text and add it to the RAG store.

    Args:
        url: Address of the page to scrape.

    Returns:
        "Scraped and added to RAG." on success, or "Scrape failed." when the
        request fails, the server answers with an HTTP error status, or the
        page contains no text.
    """
    # NOTE(review): the URL comes straight from the UI and is fetched
    # server-side; restrict allowed hosts if this is deployed publicly.
    url = (url or "").strip()
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as failures instead of indexing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.warning("Scrape failed for %r: %s", url, exc)
        return "Scrape failed."

    soup = BeautifulSoup(response.text, "html.parser")
    # Script and style bodies are not visible page text; drop them before
    # extraction so they do not pollute the index.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = " ".join(soup.get_text(separator=" ").split())

    if not text:
        logger.warning("Scrape of %r produced no text", url)
        return "Scrape failed."

    add_to_rag(text)
    return "Scraped and added to RAG."
# -------------------------
# DuckDuckGo Search
# -------------------------
def ddg_search(query):
    """Search DuckDuckGo for *query* and index the result snippets.

    Args:
        query: Search terms.

    Returns:
        The non-empty snippet bodies of up to five results joined by
        newlines; an empty string when nothing was found.
    """
    with DDGS() as ddgs:
        snippets = [
            hit.get("body", "") for hit in ddgs.text(query, max_results=5)
        ]
    combined = "\n".join(s for s in snippets if s)
    # Only index real content; an empty document would just add noise.
    if combined:
        add_to_rag(combined)
    return combined


# -------------------------
# Hybrid Entity Extraction
# -------------------------
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
_PHONE_RE = re.compile(r"\+?\d[\d -]{8,}\d")
_URL_RE = re.compile(r"https?://\S+")
_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")

# Sentence punctuation that commonly trails a URL in running text.
_URL_TRAILING = ".,;:!?\"'"

# Upper bound on the context characters placed in the chat prompt.
_MAX_CONTEXT_CHARS = 8000


def _generate(prompt):
    """Run the LLM on *prompt* and return only the newly generated text."""
    # Without return_full_text=False the pipeline echoes the prompt back in
    # front of the completion.
    result = llm(prompt, return_full_text=False)
    return result[0]["generated_text"].strip()


def regex_entities(text):
    """Extract pattern-based entities from *text*.

    Args:
        text: Text to scan.

    Returns:
        Dict with the keys "emails", "phones", "urls" and "ips", each
        mapping to a list of matched strings in order of appearance.
    """
    urls = [u.rstrip(_URL_TRAILING) for u in _URL_RE.findall(text)]
    # The pattern alone accepts octets such as 999; keep only 0-255.
    ips = [
        ip
        for ip in _IP_RE.findall(text)
        if all(int(octet) <= 255 for octet in ip.split("."))
    ]
    return {
        "emails": _EMAIL_RE.findall(text),
        "phones": _PHONE_RE.findall(text),
        "urls": urls,
        "ips": ips,
    }


def llm_refine_entities(text):
    """Ask the LLM to extract structured OSINT entities from *text*.

    Args:
        text: Text to analyse.

    Returns:
        The model's raw completion; it is requested as JSON but is not
        parsed or validated here.
    """
    prompt = (
        "Extract structured OSINT entities from this text.\n"
        "Return JSON with:\n"
        "people, organizations, locations, vehicles, usernames.\n\n"
        "TEXT:\n"
        f"{text}\n"
    )
    return _generate(prompt)


def hybrid_extract(text):
    """Combine regex extraction with LLM refinement into one report.

    Args:
        text: Text pasted by the user.

    Returns:
        A report containing the regex results followed by the LLM output,
        or a short notice when no text was supplied.
    """
    if not text or not text.strip():
        return "No text provided."

    base = regex_entities(text)
    # Only the first 2000 characters are sent to the LLM.
    refined = llm_refine_entities(text[:2000])
    return f"Regex Extracted:\n{base}\n\nLLM Refined:\n{refined}"


# -------------------------
# Chat Logic
# -------------------------
def chat(query, use_web, use_rag):
    """Answer *query*, optionally grounded in web search and RAG context.

    Args:
        query: The user's question.
        use_web: When true, run a DuckDuckGo search and use its snippets.
        use_rag: When true, add the most similar stored documents.

    Returns:
        The model's answer, or a short notice when the question is empty.
    """
    query = (query or "").strip()
    if not query:
        return "Please enter a question."

    parts = []
    if use_web:
        web_text = ddg_search(query)
        if web_text:
            parts.append(web_text)
    if use_rag:
        # ddg_search has just indexed its own result, so retrieval may
        # return the same text again; skip exact duplicates.
        for doc in retrieve_from_rag(query):
            if doc and doc not in parts:
                parts.append(doc)

    # Separate the sources clearly and cap the size so one large scraped
    # page cannot produce an arbitrarily long prompt.
    context = "\n\n".join(parts)[:_MAX_CONTEXT_CHARS]

    final_prompt = (
        "Use the following context to answer intelligently:\n\n"
        f"{context}\n\n"
        f"Question: {query}\n"
    )
    return _generate(final_prompt)


# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🔎 Hybrid OSINT AI Assistant")

    with gr.Row():
        query = gr.Textbox(label="Ask Question")
        use_web = gr.Checkbox(label="Use DuckDuckGo Search")
        use_rag = gr.Checkbox(label="Use RAG")

    chat_btn = gr.Button("Run")
    output = gr.Textbox(label="Response")

    chat_btn.click(chat, inputs=[query, use_web, use_rag], outputs=output)

    gr.Markdown("## 🌐 URL → RAG")
    url_input = gr.Textbox(label="Enter URL")
    scrape_btn = gr.Button("Scrape")
    scrape_output = gr.Textbox()

    scrape_btn.click(scrape_url, inputs=url_input, outputs=scrape_output)

    gr.Markdown("## 🧩 OSINT Entity Extraction")
    extract_input = gr.Textbox(label="Paste Text")
    extract_btn = gr.Button("Extract Entities")
    extract_output = gr.Textbox()

    extract_btn.click(hybrid_extract, inputs=extract_input, outputs=extract_output)

demo.launch()