Spaces:

NexusInstruments
/

OmniscientIRIS

Sleeping

File size: 3,999 Bytes

a4d4da8
084a8af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81aedc5
084a8af
81aedc5
084a8af
 
 
 
 
4f6bb50
084a8af
 
 
 
 
4f6bb50
084a8af
 
 
 
4f6bb50
084a8af
 
 
23bd97d
084a8af
 
531fa6d
084a8af
 
531fa6d
084a8af
 
 
23bd97d
084a8af
 
ff2c847
084a8af
531fa6d
084a8af
 
4f6bb50
084a8af
 
4f6bb50
084a8af
 
 
4f6bb50
084a8af
 
28ed01f
084a8af
 
 
 
28ed01f
084a8af
531fa6d
084a8af
531fa6d
084a8af
531fa6d
084a8af
 
 
 
 
531fa6d
084a8af
 
 
 
 
531fa6d
084a8af

import gradio as gr
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------
# Load Models (HF Safe)
# -------------------------

# Model identifiers and generation budget, named so they are easy to spot.
EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
GENERATION_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
MAX_NEW_TOKENS = 512

# Sentence-embedding model used to vectorise stored documents and queries.
embedder = SentenceTransformer(EMBEDDING_MODEL_ID)

# Text-generation pipeline used by both the chat and entity-refinement paths.
llm = pipeline(
    task="text-generation",
    model=GENERATION_MODEL_ID,
    max_new_tokens=MAX_NEW_TOKENS,
)

# -------------------------
# Simple In-Memory Vector Store
# -------------------------

# Parallel lists: embeddings[i] is the vector stored for documents[i].
# Both live only in process memory and start out empty.
documents, embeddings = [], []

def add_to_rag(text):
    """Embed *text* and append it to the in-memory vector store.

    Blank input (``None``, empty, or whitespace-only) is ignored so the store
    never holds entries that carry no content.

    Args:
        text: Document text to index.
    """
    if not text or not text.strip():
        return
    # Encode before touching either list: if the embedding call raises, the
    # two lists stay the same length and index i still means the same entry
    # in both.
    vector = embedder.encode(text)
    documents.append(text)
    embeddings.append(vector)

def retrieve_from_rag(query, top_k=3):
    """Return up to *top_k* stored documents most similar to *query*.

    Similarity is the cosine similarity between the query embedding and each
    stored embedding; results are ordered from most to least similar.

    Args:
        query: Text to search for.
        top_k: Maximum number of documents to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of document strings, empty when nothing is stored or
        ``top_k`` is not positive.
    """
    # Guard first: with the slice ``[-top_k:]`` a top_k of 0 would select the
    # whole array and return every document instead of none.
    if top_k <= 0 or not embeddings:
        return []
    query_emb = embedder.encode(query)
    sims = cosine_similarity([query_emb], embeddings)[0]
    # argsort is ascending, so reverse it and keep the leading top_k indices.
    ranked = np.argsort(sims)[::-1]
    return [documents[i] for i in ranked[:top_k]]

# -------------------------
# URL Scraper
# -------------------------

def scrape_url(url):
    """Fetch *url*, extract its readable text, and add it to the RAG store.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``"Scraped and added to RAG."`` when text was fetched and indexed,
        otherwise ``"Scrape failed."`` (blank URL, network or HTTP error,
        page without text, or any failure while parsing or indexing).
    """
    if not url or not url.strip():
        return "Scrape failed."
    try:
        r = requests.get(url.strip(), timeout=10)
        # Treat 4xx/5xx responses as failures instead of indexing error pages.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Script and style bodies are not readable content; drop them so they
        # do not end up in the stored text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # Collapse the whitespace runs that HTML layout leaves behind.
        text = " ".join(soup.get_text(separator=" ").split())
        if not text:
            return "Scrape failed."
        add_to_rag(text)
        return "Scraped and added to RAG."
    except Exception:
        # This function feeds a UI status box, so failures are reported as a
        # string. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return "Scrape failed."

# -------------------------
# DuckDuckGo Search
# -------------------------

def ddg_search(query):
    """Search DuckDuckGo for *query* and index the returned snippets.

    Up to five text results are requested. Their ``"body"`` snippets are
    joined with newlines, added to the RAG store as one document, and
    returned.

    Args:
        query: Search keywords.

    Returns:
        The newline-joined snippets, or ``""`` when the query is blank or no
        result carried a snippet (nothing is indexed in that case).
    """
    if not query or not query.strip():
        return ""
    with DDGS() as ddgs:
        hits = ddgs.text(query, max_results=5) or []
        # Consume the results while the session is open; skip entries that
        # have no snippet rather than failing on a missing key.
        snippets = [hit["body"] for hit in hits if hit.get("body")]
    combined = "\n".join(snippets)
    if combined:
        add_to_rag(combined)
    return combined

# -------------------------
# Hybrid Entity Extraction
# -------------------------

# Patterns are compiled once at import time instead of on every call.
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
_PHONE_RE = re.compile(r"\+?\d[\d -]{8,}\d")
_URL_RE = re.compile(r"https?://\S+")
_IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")

# Characters that end a sentence or quotation rather than a URL.
_URL_TRAILING_PUNCT = ".,;:!?\"'>"


def _trim_url(url):
    """Strip trailing punctuation that ``\\S+`` swept up after a URL."""
    while url:
        last = url[-1]
        if last in _URL_TRAILING_PUNCT:
            url = url[:-1]
        elif last == ")" and url.count(")") > url.count("("):
            # Only drop a closing parenthesis that has no opening partner,
            # so URLs such as ".../Python_(language)" stay intact.
            url = url[:-1]
        else:
            break
    return url


def _is_ipv4(candidate):
    """Return True when every dotted octet of *candidate* is at most 255."""
    return all(int(part) <= 255 for part in candidate.split("."))


def _unique(items):
    """Return *items* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def regex_entities(text):
    """Extract pattern-based entities from *text*.

    Args:
        text: Free text to scan. ``None`` or an empty string yields empty
            lists.

    Returns:
        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
        ``"ips"``. Each value is a list of unique matches in order of first
        appearance. URLs have trailing sentence punctuation removed, and
        dotted quads with an octet above 255 are not reported as IPs.
    """
    text = text or ""
    trimmed_urls = (_trim_url(url) for url in _URL_RE.findall(text))
    entities = {
        "emails": _unique(_EMAIL_RE.findall(text)),
        "phones": _unique(_PHONE_RE.findall(text)),
        "urls": _unique(url for url in trimmed_urls if url),
        "ips": _unique(ip for ip in _IP_RE.findall(text) if _is_ipv4(ip)),
    }
    return entities

def llm_refine_entities(text):
    """Ask the language model to list OSINT entities found in *text*.

    Args:
        text: Text to analyse.

    Returns:
        The model's generated text (the prompt asks for JSON, but the output
        is returned as-is and is not parsed or validated). Returns ``""``
        without calling the model when *text* is blank.
    """
    if not text or not text.strip():
        return ""
    prompt = f"""
Extract structured OSINT entities from this text.
Return JSON with:
people, organizations, locations, vehicles, usernames.

TEXT:
{text}
"""
    # By default the text-generation pipeline returns the prompt followed by
    # the completion; ask for the completion only so the caller does not get
    # the instructions and input text echoed back.
    output = llm(prompt, return_full_text=False)[0]["generated_text"]
    return output

def hybrid_extract(text):
    """Run both extractors on *text* and format their results as one report.

    The regex extractor sees the full text; the language model is given only
    the first 2000 characters.

    Args:
        text: Text to analyse.

    Returns:
        A string with a "Regex Extracted" section followed by an
        "LLM Refined" section.
    """
    pattern_hits = regex_entities(text)
    model_report = llm_refine_entities(text[:2000])
    return f"Regex Extracted:\n{pattern_hits}\n\nLLM Refined:\n{model_report}"

# -------------------------
# Chat Logic
# -------------------------

def chat(query, use_web, use_rag):
    """Answer *query* with the language model, optionally grounded in context.

    Args:
        query: The user's question.
        use_web: When true, run a DuckDuckGo search for the query and include
            the returned snippets as context.
        use_rag: When true, include the stored documents most similar to the
            query as context.

    Returns:
        The model's generated answer, or ``"Please enter a question."`` when
        *query* is blank.
    """
    if not query or not query.strip():
        return "Please enter a question."

    parts = []

    if use_web:
        web_text = ddg_search(query)
        if web_text:
            parts.append(web_text)

    if use_rag:
        # ddg_search also indexes what it returns, so retrieval can hand back
        # the very same text; skip anything already present in the context.
        for doc in retrieve_from_rag(query):
            if doc and doc not in parts:
                parts.append(doc)

    # Separate the pieces so the end of one source does not run into the
    # start of the next.
    # NOTE(review): the context length is unbounded; large scraped pages may
    # exceed what the model can handle. Consider capping it.
    context = "\n\n".join(parts)

    final_prompt = f"""
Use the following context to answer intelligently:

{context}

Question: {query}
"""

    # Request only the completion; by default the pipeline prepends the whole
    # prompt (context included) to the generated text.
    response = llm(final_prompt, return_full_text=False)[0]["generated_text"]
    return response

# -------------------------
# Gradio UI
# -------------------------

# Build the single-page UI. Components appear in the order they are created.
with gr.Blocks() as demo:
    gr.Markdown("# 🔎 Hybrid OSINT AI Assistant")

    # --- Question answering: query box plus the two context toggles ---
    with gr.Row():
        query = gr.Textbox(label="Ask Question")
        use_web = gr.Checkbox(label="Use DuckDuckGo Search")
        use_rag = gr.Checkbox(label="Use RAG")

    chat_btn = gr.Button("Run")

    output = gr.Textbox(label="Response")

    # "Run" calls chat(query, use_web, use_rag) and shows the returned text.
    chat_btn.click(chat, inputs=[query, use_web, use_rag], outputs=output)

    # --- URL ingestion: fetch a page and add its text to the RAG store ---
    gr.Markdown("## 🌐 URL → RAG")
    url_input = gr.Textbox(label="Enter URL")
    scrape_btn = gr.Button("Scrape")
    scrape_output = gr.Textbox()
    # "Scrape" calls scrape_url(url) and shows its status string.
    scrape_btn.click(scrape_url, inputs=url_input, outputs=scrape_output)

    # --- Entity extraction: regex matches plus the LLM's refinement ---
    gr.Markdown("## 🧩 OSINT Entity Extraction")
    extract_input = gr.Textbox(label="Paste Text")
    extract_btn = gr.Button("Extract Entities")
    extract_output = gr.Textbox()
    # "Extract Entities" calls hybrid_extract(text) and shows its report.
    extract_btn.click(hybrid_extract, inputs=extract_input, outputs=extract_output)

# Start the app as soon as the module is executed (no __main__ guard).
demo.launch()