Spaces:

NexusInstruments
/

OmniscientIRIS

Sleeping

App Files Files Community

OmniscientIRIS / app.py

NexusInstruments

Update app.py

084a8af verified 3 months ago

raw

history blame contribute delete

4 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from duckduckgo_search import DDGS
	from sentence_transformers import SentenceTransformer
	from transformers import pipeline
	import numpy as np
	import re
	from sklearn.metrics.pairwise import cosine_similarity

	# -------------------------
	# Load Models (HF Safe)
	# -------------------------

	# Sentence-embedding model used to vectorise documents and queries.
	embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

	# Text-generation pipeline; every call may produce up to 512 new tokens.
	llm = pipeline(
	    task="text-generation",
	    model="HuggingFaceH4/zephyr-7b-beta",
	    max_new_tokens=512,
	)

	# -------------------------
	# Simple In-Memory Vector Store
	# -------------------------

	# Two parallel, initially empty lists backing the in-memory store:
	# one for raw document texts, one for their embedding vectors.
	documents, embeddings = [], []

	def add_to_rag(text):
	    """Embed ``text`` and append it to the in-memory vector store.

	    Blank input (``None``, empty, or whitespace-only) is ignored so the
	    store never accumulates documents that carry no content.

	    Args:
	        text: Document text to index.

	    Returns:
	        None. The module-level ``documents`` and ``embeddings`` lists are
	        extended in place.
	    """
	    if not text or not text.strip():
	        return
	    # Encode first: if encoding raises, neither list has been touched, so
	    # ``documents`` and ``embeddings`` can never end up with different
	    # lengths (which would misalign every later lookup).
	    vector = embedder.encode(text)
	    documents.append(text)
	    embeddings.append(vector)

	def retrieve_from_rag(query, top_k=3):
	    """Return the stored documents most similar to ``query``.

	    Similarity is the cosine similarity between the query embedding and
	    every stored embedding.

	    Args:
	        query: Text to search for.
	        top_k: Maximum number of documents to return. Values <= 0 yield
	            an empty list.

	    Returns:
	        Up to ``top_k`` document strings, most similar first; an empty
	        list when the store is empty.
	    """
	    # A ``[-top_k:]`` slice with top_k == 0 selects the *whole* array, so
	    # non-positive requests are answered explicitly instead.
	    if top_k <= 0 or not embeddings:
	        return []
	    query_emb = embedder.encode(query)
	    sims = cosine_similarity([query_emb], embeddings)[0]
	    # argsort is ascending; reverse it so the best match comes first.
	    ranked = np.argsort(sims)[::-1]
	    return [documents[i] for i in ranked[:top_k]]

	# -------------------------
	# URL Scraper
	# -------------------------

	def scrape_url(url):
	    """Download a web page, extract its text, and add it to the RAG store.

	    Args:
	        url: Address of the page to fetch; must start with ``http://`` or
	            ``https://``.

	    Returns:
	        ``"Scraped and added to RAG."`` on success, ``"Scrape failed."``
	        when the URL is invalid, the request fails or returns an HTTP
	        error status, or the page yields no text.
	    """
	    import logging  # local import keeps the file's import block untouched

	    if not isinstance(url, str):
	        return "Scrape failed."
	    url = url.strip()
	    # Reject blank input and non-web schemes before touching the network.
	    # NOTE(review): the URL is user-supplied and internal hosts are not
	    # blocked here; consider an allow/deny list if this is public-facing.
	    if not url.lower().startswith(("http://", "https://")):
	        return "Scrape failed."

	    try:
	        r = requests.get(url, timeout=10)
	        # Without this, 4xx/5xx error pages would be indexed as content.
	        r.raise_for_status()
	        text = BeautifulSoup(r.text, "html.parser").get_text(separator=" ")
	        if not text.strip():
	            return "Scrape failed."
	        add_to_rag(text)
	    except Exception:
	        # UI boundary: report failure to the user but keep the traceback
	        # in the logs instead of silently discarding it.
	        logging.getLogger(__name__).exception("Failed to scrape %s", url)
	        return "Scrape failed."
	    return "Scraped and added to RAG."

	# -------------------------
	# DuckDuckGo Search
	# -------------------------

	def ddg_search(query, max_results=5):
	    """Search DuckDuckGo and index the result snippets for RAG.

	    Args:
	        query: Search string.
	        max_results: Maximum number of results to request (default 5).

	    Returns:
	        The non-empty result bodies joined by newlines; an empty string
	        when the search produced no usable snippets.
	    """
	    with DDGS() as ddgs:
	        hits = ddgs.text(query, max_results=max_results) or []
	        # Results lacking a body are skipped rather than raising KeyError.
	        snippets = [hit["body"] for hit in hits if hit.get("body")]
	    combined = "\n".join(snippets)
	    # Only index real content; an empty search must not add a blank
	    # document to the store.
	    if combined.strip():
	        add_to_rag(combined)
	    return combined

	# -------------------------
	# Hybrid Entity Extraction
	# -------------------------

	_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
	_PHONE_RE = re.compile(r"\+?\d[\d -]{8,}\d")
	_URL_RE = re.compile(r"https?://\S+")
	_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")

	# Characters that usually belong to the surrounding sentence, not the URL.
	_URL_TRAILING_PUNCT = ".,;:!?\"'>)]"
	_URL_BRACKET_PAIRS = {")": "(", "]": "["}


	def _trim_url(url):
	    """Strip trailing sentence punctuation from a matched URL."""
	    while url and url[-1] in _URL_TRAILING_PUNCT:
	        last = url[-1]
	        opener = _URL_BRACKET_PAIRS.get(last)
	        # Keep a closing bracket that is balanced inside the URL itself,
	        # e.g. ".../wiki/Foo_(bar)".
	        if opener is not None and url.count(opener) >= url.count(last):
	            break
	        url = url[:-1]
	    return url


	def _is_valid_ipv4(candidate):
	    """Return True when every dotted octet is in the range 0-255."""
	    return all(int(octet) <= 255 for octet in candidate.split("."))


	def regex_entities(text):
	    """Extract pattern-based entities from ``text``.

	    Args:
	        text: Free text to scan.

	    Returns:
	        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
	        ``"ips"``, each mapping to a list of matches in order of
	        appearance. URLs have trailing sentence punctuation removed and
	        dotted quads with an octet above 255 are not reported as IPs.
	    """
	    return {
	        "emails": _EMAIL_RE.findall(text),
	        "phones": _PHONE_RE.findall(text),
	        "urls": [_trim_url(u) for u in _URL_RE.findall(text)],
	        "ips": [ip for ip in _IP_RE.findall(text) if _is_valid_ipv4(ip)],
	    }

	def llm_refine_entities(text):
	    """Ask the LLM to extract structured OSINT entities from ``text``.

	    Args:
	        text: Source text to analyse. It is inserted into the prompt
	            unchanged, so callers should bound its length.

	    Returns:
	        The model's generated text (requested as JSON with people,
	        organizations, locations, vehicles and usernames). The output is
	        not parsed or validated here.
	    """
	    prompt = (
	        "\n"
	        "Extract structured OSINT entities from this text.\n"
	        "Return JSON with:\n"
	        "people, organizations, locations, vehicles, usernames.\n"
	        "\n"
	        "TEXT:\n"
	        f"{text}\n"
	    )
	    # The text-generation pipeline echoes the prompt by default; request
	    # only the newly generated continuation so the result is the answer,
	    # not the instructions plus the answer.
	    output = llm(prompt, return_full_text=False)[0]["generated_text"]
	    return output.strip()

	def hybrid_extract(text):
	    """Combine regex matches and LLM output into one text report.

	    The regex pass runs over the whole input; the LLM pass only receives
	    the first 2000 characters.

	    Args:
	        text: Text to analyse.

	    Returns:
	        A string with a "Regex Extracted" section followed by an
	        "LLM Refined" section.
	    """
	    pattern_hits = regex_entities(text)
	    llm_view = llm_refine_entities(text[:2000])
	    report = "Regex Extracted:\n{}\n\nLLM Refined:\n{}"
	    return report.format(pattern_hits, llm_view)

	# -------------------------
	# Chat Logic
	# -------------------------

	def chat(query, use_web, use_rag):
	    """Answer ``query`` with the LLM, optionally grounded in context.

	    Args:
	        query: The user's question.
	        use_web: When truthy, run a DuckDuckGo search and use the result
	            snippets as context.
	        use_rag: When truthy, add the most similar stored documents to
	            the context.

	    Returns:
	        The model's answer, or a short notice when ``query`` is blank.
	    """
	    if not query or not query.strip():
	        return "Please enter a question."

	    parts = []
	    if use_web:
	        parts.append(ddg_search(query))
	    if use_rag:
	        parts.extend(retrieve_from_rag(query))

	    # The web search also indexes its result, so with both options on the
	    # same text can come back from retrieval. dict.fromkeys drops exact
	    # duplicates while keeping first-seen order; joining with a newline
	    # keeps web and RAG context from running together.
	    context = "\n".join(dict.fromkeys(part for part in parts if part))

	    final_prompt = (
	        "\n"
	        "Use the following context to answer intelligently:\n"
	        "\n"
	        f"{context}\n"
	        "\n"
	        f"Question: {query}\n"
	    )

	    # Request only the generated continuation; by default the pipeline
	    # returns the prompt (including all context) followed by the answer.
	    response = llm(final_prompt, return_full_text=False)[0]["generated_text"]
	    return response.strip()

	# -------------------------
	# Gradio UI
	# -------------------------

	with gr.Blocks() as demo:
	    gr.Markdown("# 🔎 Hybrid OSINT AI Assistant")

	    # --- Question answering: query box plus the two context toggles ---
	    with gr.Row():
	        query = gr.Textbox(label="Ask Question")
	        use_web = gr.Checkbox(label="Use DuckDuckGo Search")
	        use_rag = gr.Checkbox(label="Use RAG")

	    chat_btn = gr.Button("Run")
	    output = gr.Textbox(label="Response")
	    chat_btn.click(fn=chat, inputs=[query, use_web, use_rag], outputs=output)

	    # --- Scrape a URL and index its text ---
	    gr.Markdown("## 🌐 URL → RAG")
	    url_input = gr.Textbox(label="Enter URL")
	    scrape_btn = gr.Button("Scrape")
	    scrape_output = gr.Textbox()
	    scrape_btn.click(fn=scrape_url, inputs=url_input, outputs=scrape_output)

	    # --- Entity extraction from pasted text ---
	    gr.Markdown("## 🧩 OSINT Entity Extraction")
	    extract_input = gr.Textbox(label="Paste Text")
	    extract_btn = gr.Button("Extract Entities")
	    extract_output = gr.Textbox()
	    extract_btn.click(
	        fn=hybrid_extract,
	        inputs=extract_input,
	        outputs=extract_output,
	    )

	# Start the web server once the layout is fully defined.
	demo.launch()