File size: 3,999 Bytes
a4d4da8
084a8af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81aedc5
084a8af
81aedc5
084a8af
 
 
 
 
4f6bb50
084a8af
 
 
 
 
4f6bb50
084a8af
 
 
 
4f6bb50
084a8af
 
 
23bd97d
084a8af
 
531fa6d
084a8af
 
531fa6d
084a8af
 
 
23bd97d
084a8af
 
ff2c847
084a8af
531fa6d
084a8af
 
4f6bb50
084a8af
 
4f6bb50
084a8af
 
 
4f6bb50
084a8af
 
28ed01f
084a8af
 
 
 
28ed01f
084a8af
531fa6d
084a8af
531fa6d
084a8af
531fa6d
084a8af
 
 
 
 
531fa6d
084a8af
 
 
 
 
531fa6d
084a8af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import gradio as gr
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------
# Load Models (HF Safe)
# -------------------------

# Sentence-embedding model used to vectorise both stored documents and queries.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Text-generation pipeline used by the chat and entity-refinement paths;
# every call may generate at most 512 new tokens.
llm = pipeline(task="text-generation", model="HuggingFaceH4/zephyr-7b-beta", max_new_tokens=512)

# -------------------------
# Simple In-Memory Vector Store
# -------------------------

# Index-aligned lists: documents[i] is the raw text, embeddings[i] its vector.
documents, embeddings = [], []

def add_to_rag(text):
    """Embed ``text`` and append it to the in-memory vector store.

    The raw text is stored in ``documents`` and its vector in ``embeddings``
    at the same index, so the two lists must always stay the same length.

    Args:
        text: Document text to index. ``None``, empty, or whitespace-only
            input is ignored so blank entries never pollute retrieval.
    """
    if not text or not text.strip():
        return
    # Encode before touching either list: if encoding raises, the store is
    # left unchanged instead of holding a document without a vector.
    vector = embedder.encode(text)
    # Both lists are mutated in place, so no ``global`` declaration is needed.
    documents.append(text)
    embeddings.append(vector)

def retrieve_from_rag(query, top_k=3):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Text to embed and compare against the stored embeddings.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` document strings ordered from most to least similar
        by cosine similarity. An empty list is returned when ``top_k`` is not
        positive or when nothing has been indexed yet.
    """
    # Guard first: with top_k == 0 the slice ``[-0:]`` would select every
    # document, and a negative top_k would select an arbitrary tail.
    if top_k <= 0 or not embeddings:
        return []
    query_emb = embedder.encode(query)
    sims = cosine_similarity([query_emb], embeddings)[0]
    # argsort is ascending, so reverse it and keep the first top_k indices.
    ranked = np.argsort(sims)[::-1][:top_k]
    return [documents[i] for i in ranked]

# -------------------------
# URL Scraper
# -------------------------

def scrape_url(url):
    """Download ``url``, extract its text content and index it in the RAG store.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``"Scraped and added to RAG."`` on success, or ``"Scrape failed."``
        when fetching, parsing, or indexing raises, or when the server
        answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures so error pages are not indexed
        # as if they were the requested content.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        text = soup.get_text(separator=" ")
        add_to_rag(text)
        return "Scraped and added to RAG."
    except Exception:
        # Best-effort UI handler: report failure instead of raising, but
        # (unlike a bare ``except``) let KeyboardInterrupt/SystemExit through.
        return "Scrape failed."

# -------------------------
# DuckDuckGo Search
# -------------------------

def ddg_search(query):
    """Search DuckDuckGo for ``query`` and index the result snippets.

    Args:
        query: Search phrase passed to DuckDuckGo.

    Returns:
        The ``body`` snippets of up to five results joined with newlines, or
        an empty string when the search produced no usable snippets.
    """
    with DDGS() as ddgs:
        # Consume the results inside the context manager; use .get() so a
        # result without a "body" field is skipped instead of raising KeyError.
        snippets = [hit.get("body", "") for hit in ddgs.text(query, max_results=5)]
    combined = "\n".join(snippet for snippet in snippets if snippet)
    # Only index real content; an empty search must not add a blank document.
    if combined:
        add_to_rag(combined)
    return combined

# -------------------------
# Hybrid Entity Extraction
# -------------------------

def regex_entities(text):
    """Extract pattern-based entities from ``text`` using regular expressions.

    Args:
        text: Free-form text to scan.

    Returns:
        A dict with the keys ``"emails"``, ``"phones"``, ``"urls"`` and
        ``"ips"``, each mapping to a list of matched strings in order of
        appearance. URLs have trailing sentence punctuation removed, and
        dotted quads with an octet above 255 are not reported as IPs.
    """
    # ``\S+`` also swallows punctuation that merely ends the sentence
    # ("see https://example.com."), so trim it from the right-hand side.
    urls = [url.rstrip(".,;:!?") for url in re.findall(r"https?://\S+", text)]

    # The pattern only checks the digit count; validate the 0-255 octet range
    # here so values such as 999.300.1.2 are rejected.
    ips = [
        candidate
        for candidate in re.findall(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", text)
        if all(int(octet) <= 255 for octet in candidate.split("."))
    ]

    return {
        "emails": re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text),
        "phones": re.findall(r"\+?\d[\d -]{8,}\d", text),
        "urls": urls,
        "ips": ips,
    }

def llm_refine_entities(text):
    """Ask the LLM to extract structured OSINT entities from ``text``.

    Args:
        text: Text to analyse; it is embedded verbatim in the prompt.

    Returns:
        The model's generated answer as a string. The prompt asks for JSON,
        but the output is returned unparsed and is not validated.
    """
    prompt = f"""
Extract structured OSINT entities from this text.
Return JSON with:
people, organizations, locations, vehicles, usernames.

TEXT:
{text}
"""
    # By default the text-generation pipeline returns prompt + completion;
    # request only the newly generated part so the instructions and the input
    # text are not echoed back to the user.
    output = llm(prompt, return_full_text=False)[0]["generated_text"]
    return output

def hybrid_extract(text):
    """Run the regex and LLM extractors on ``text`` and format one report.

    The regex pass scans the whole text; only the first 2000 characters are
    forwarded to the LLM.

    Returns:
        A string with a "Regex Extracted" section followed by an
        "LLM Refined" section.
    """
    llm_input = text[:2000]
    regex_result = regex_entities(text)
    llm_result = llm_refine_entities(llm_input)
    report = f"Regex Extracted:\n{regex_result}\n\nLLM Refined:\n{llm_result}"
    return report

# -------------------------
# Chat Logic
# -------------------------

def chat(query, use_web, use_rag):
    """Answer ``query`` with the LLM, optionally grounded in retrieved context.

    Args:
        query: The user's question.
        use_web: When true, run a DuckDuckGo search and add its snippets to
            the context.
        use_rag: When true, add the most similar stored documents to the
            context.

    Returns:
        The text generated by the model, without the prompt.
    """
    context_parts = []

    if use_web:
        web_text = ddg_search(query)
        if web_text:
            context_parts.append(web_text)

    if use_rag:
        for doc in retrieve_from_rag(query):
            # ddg_search indexes its own result, so the same text can come
            # straight back from retrieval; keep each passage only once.
            if doc not in context_parts:
                context_parts.append(doc)

    # Separate the passages explicitly so web snippets and stored documents
    # do not run into each other.
    context = "\n\n".join(context_parts)

    final_prompt = f"""
Use the following context to answer intelligently:

{context}

Question: {query}
"""

    # return_full_text=False drops the echoed prompt from the pipeline output.
    response = llm(final_prompt, return_full_text=False)[0]["generated_text"]
    return response

# -------------------------
# Gradio UI
# -------------------------

# Build the single-page UI. Components are declared in the order they should
# appear, so the statement order below is significant.
with gr.Blocks() as demo:
    gr.Markdown("# 🔎 Hybrid OSINT AI Assistant")

    # Question box and the two context toggles share one row.
    with gr.Row():
        query = gr.Textbox(label="Ask Question")
        use_web = gr.Checkbox(label="Use DuckDuckGo Search")
        use_rag = gr.Checkbox(label="Use RAG")

    chat_btn = gr.Button("Run")

    output = gr.Textbox(label="Response")

    # "Run" passes the question and both checkbox values to chat() and shows
    # its return value in the response box.
    chat_btn.click(chat, inputs=[query, use_web, use_rag], outputs=output)

    # Section: scrape a URL and add its text to the RAG store.
    gr.Markdown("## 🌐 URL → RAG")
    url_input = gr.Textbox(label="Enter URL")
    scrape_btn = gr.Button("Scrape")
    scrape_output = gr.Textbox()
    # scrape_url() returns a status message, which is shown in scrape_output.
    scrape_btn.click(scrape_url, inputs=url_input, outputs=scrape_output)

    # Section: combined regex + LLM entity extraction on pasted text.
    gr.Markdown("## 🧩 OSINT Entity Extraction")
    extract_input = gr.Textbox(label="Paste Text")
    extract_btn = gr.Button("Extract Entities")
    extract_output = gr.Textbox()
    # hybrid_extract() returns the formatted report shown in extract_output.
    extract_btn.click(hybrid_extract, inputs=extract_input, outputs=extract_output)

# Launches the app whenever this module is executed; there is no
# ``if __name__ == "__main__"`` guard.
demo.launch()