# OmniscientIRIS / app.py
# NexusInstruments's picture
# Update app.py
# 084a8af verified
import gradio as gr
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
# -------------------------
# Load Models (HF Safe)
# -------------------------
# Sentence-embedding model; its encode() output is what the in-memory
# vector store below keeps and compares.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Shared text-generation pipeline, capped at 512 newly generated tokens
# per call.
llm = pipeline(
    task="text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    max_new_tokens=512,
)
# -------------------------
# Simple In-Memory Vector Store
# -------------------------
# Parallel lists: embeddings[i] is the vector produced for documents[i].
documents = []
embeddings = []


def add_to_rag(text):
    """Embed ``text`` and append it to the in-memory vector store.

    Blank input (``None``, empty, or whitespace-only) is ignored so the
    store never holds documents with nothing to retrieve.

    Args:
        text: Raw document text to index.

    Returns:
        None. ``documents`` and ``embeddings`` are extended in place.
    """
    if not text or not text.strip():
        return
    # Encode before touching either list: if encoding raises, the two
    # parallel lists stay the same length and remain index-aligned.
    vector = embedder.encode(text)
    documents.append(text)
    embeddings.append(vector)
def retrieve_from_rag(query, top_k=3):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Text to embed and compare against the stored embeddings.
        top_k: Maximum number of documents to return. Values <= 0 yield
            an empty list.

    Returns:
        Up to ``top_k`` documents ordered from most to least similar by
        cosine similarity; ``[]`` when the store is empty.
    """
    # Guard top_k first: a slice of [-0:] would select *every* document,
    # and a negative top_k would slice from the wrong end.
    if top_k <= 0 or not embeddings:
        return []
    query_emb = embedder.encode(query)
    sims = cosine_similarity([query_emb], embeddings)[0]
    # argsort is ascending, so take the tail and reverse it to get the
    # best matches first.
    top_idx = np.argsort(sims)[-top_k:][::-1]
    return [documents[i] for i in top_idx]
# -------------------------
# URL Scraper
# -------------------------
def scrape_url(url):
    """Fetch ``url``, extract its visible text and add it to the RAG store.

    Args:
        url: Address of the page to download.

    Returns:
        ``"Scraped and added to RAG."`` on success, ``"Scrape failed."``
        when the URL is blank, the request or parsing fails, the server
        answers with an HTTP error status, or the page has no text.
    """
    if not url or not url.strip():
        return "Scrape failed."
    # NOTE(review): the URL comes straight from the UI and is fetched
    # server-side; consider restricting schemes/hosts if this is exposed
    # publicly.
    try:
        r = requests.get(url.strip(), timeout=10)
        # Do not index 4xx/5xx error pages as if they were content.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Inline scripts and stylesheets would otherwise end up in the
        # extracted text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # Collapse runs of whitespace left behind by the markup.
        text = " ".join(soup.get_text(separator=" ").split())
        if not text:
            return "Scrape failed."
        add_to_rag(text)
    except Exception:
        # Best-effort UI action: report failure instead of raising, but
        # let KeyboardInterrupt/SystemExit through (unlike a bare except).
        return "Scrape failed."
    return "Scraped and added to RAG."
# -------------------------
# DuckDuckGo Search
# -------------------------
def ddg_search(query):
    """Search DuckDuckGo and index the result snippets in the RAG store.

    Args:
        query: Search terms passed to ``DDGS.text``.

    Returns:
        The ``"body"`` snippets of up to five results joined by newlines,
        or ``""`` when nothing usable came back.
    """
    with DDGS() as ddgs:
        hits = ddgs.text(query, max_results=5) or []
        # .get() tolerates hits without a "body" key; those are dropped by
        # the filter below.
        bodies = [hit.get("body") for hit in hits]
    combined = "\n".join(body for body in bodies if body)
    # Only index real content: an empty string would otherwise be stored
    # as a document.
    if combined:
        add_to_rag(combined)
    return combined
# -------------------------
# Hybrid Entity Extraction
# -------------------------
def regex_entities(text):
    """Pull pattern-based entities out of ``text``.

    Returns a dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
    ``"ips"`` (in that order), each mapped to the list of ``re.findall``
    matches for its pattern.
    """
    # (label, pattern) pairs; order here is the key order of the result.
    pattern_table = (
        ("emails", r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
        ("phones", r"\+?\d[\d -]{8,}\d"),
        ("urls", r"https?://\S+"),
        ("ips", r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    )
    return {label: re.findall(pattern, text) for label, pattern in pattern_table}
def llm_refine_entities(text):
    """Ask the LLM to extract structured OSINT entities from ``text``.

    Args:
        text: Source text to analyse.

    Returns:
        The model's generated answer (the prompt asks for JSON listing
        people, organizations, locations, vehicles and usernames), or
        ``""`` when ``text`` is blank. The output is returned as raw text
        and is not parsed or validated here.
    """
    # Nothing to analyse: skip the generation call entirely.
    if not text or not text.strip():
        return ""
    prompt = (
        "\n"
        "Extract structured OSINT entities from this text.\n"
        "Return JSON with:\n"
        "people, organizations, locations, vehicles, usernames.\n"
        "TEXT:\n"
        f"{text}\n"
    )
    # By default the text-generation pipeline returns prompt + completion;
    # return_full_text=False keeps only the newly generated answer.
    output = llm(prompt, return_full_text=False)[0]["generated_text"]
    return output
def hybrid_extract(text):
    """Run regex extraction and LLM refinement and format both as a report.

    The regex pass sees the whole ``text``; the LLM pass only receives the
    first 2000 characters.
    """
    base = regex_entities(text)
    # Cap what is sent to the model.
    llm_input = text[:2000]
    refined = llm_refine_entities(llm_input)
    return f"Regex Extracted:\n{base}\n\nLLM Refined:\n{refined}"
# -------------------------
# Chat Logic
# -------------------------
def chat(query, use_web, use_rag):
    """Answer ``query`` with the LLM, optionally grounded in extra context.

    Args:
        query: The user's question.
        use_web: When truthy, run a DuckDuckGo search and include the
            result snippets as context.
        use_rag: When truthy, include the most similar documents from the
            in-memory RAG store as context.

    Returns:
        The model's generated answer, or a short hint when ``query`` is
        blank.
    """
    if not query or not query.strip():
        return "Please enter a question."

    context_parts = []
    if use_web:
        web_text = ddg_search(query)
        if web_text:
            context_parts.append(web_text)
    if use_rag:
        # ddg_search stores its combined snippets in the RAG store, so a
        # retrieval right after a web search can hand the same text back;
        # skip exact duplicates.
        for doc in retrieve_from_rag(query):
            if doc and doc not in context_parts:
                context_parts.append(doc)
    # Join with a newline so web and RAG passages do not run together.
    context = "\n".join(context_parts)

    final_prompt = (
        "\n"
        "Use the following context to answer intelligently:\n"
        f"{context}\n"
        f"Question: {query}\n"
    )
    # return_full_text=False: return only the answer, not prompt + answer.
    response = llm(final_prompt, return_full_text=False)[0]["generated_text"]
    return response
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🔎 Hybrid OSINT AI Assistant")

    # --- Question answering -------------------------------------------
    # NOTE(review): the original indentation was lost; the row is assumed
    # to hold the question box and the two toggles — confirm the layout.
    with gr.Row():
        query = gr.Textbox(label="Ask Question")
        use_web = gr.Checkbox(label="Use DuckDuckGo Search")
        use_rag = gr.Checkbox(label="Use RAG")
    chat_btn = gr.Button("Run")
    output = gr.Textbox(label="Response")
    chat_btn.click(
        fn=chat,
        inputs=[query, use_web, use_rag],
        outputs=output,
    )

    # --- Scrape a URL into the RAG store ------------------------------
    gr.Markdown("## 🌐 URL → RAG")
    url_input = gr.Textbox(label="Enter URL")
    scrape_btn = gr.Button("Scrape")
    scrape_output = gr.Textbox()
    scrape_btn.click(
        fn=scrape_url,
        inputs=url_input,
        outputs=scrape_output,
    )

    # --- Entity extraction --------------------------------------------
    gr.Markdown("## 🧩 OSINT Entity Extraction")
    extract_input = gr.Textbox(label="Paste Text")
    extract_btn = gr.Button("Extract Entities")
    extract_output = gr.Textbox()
    extract_btn.click(
        fn=hybrid_extract,
        inputs=extract_input,
        outputs=extract_output,
    )

# Start the web server once the whole interface has been declared.
demo.launch()