Spaces: Running

Emre Sarigöl committed
Commit · d3a246e
1 Parent(s): a02dc64
Deploy GURMA.ai Dashboard - 2026-02-18 14:15

Files changed:
- Dockerfile +1 -1
- app.py +267 -256
- cli.py +340 -0
- config.py +99 -0
- extract.py +537 -0
- intel.py +508 -0
- llm.py +154 -0
- research.py +61 -1913
- search.py +305 -0
- sota_agent.py +850 -0
- tr_agents.py +480 -0
- tr_tab.py +218 -0
Dockerfile CHANGED

@@ -23,7 +23,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
 # Create data directories
-RUN mkdir -p data/intel
+RUN mkdir -p data/intel data/tr-mali data/tr-fonlar docs
 
 # Expose Streamlit port (HF Spaces expects app_port from README.md)
 EXPOSE 8501
app.py CHANGED

@@ -25,7 +25,7 @@ import pandas as pd
 IS_HF_SPACE = os.getenv("HF_SPACE") or Path("/app/research.py").exists()
 
 if IS_HF_SPACE:
-    # HF Space:
+    # HF Space: research.py shim re-exports everything
     from research import (
         SearchService,
         CompetitorExtractor,

@@ -41,9 +41,9 @@ if IS_HF_SPACE:
     LLM_ENABLED,
 )
 else:
-    # Local:
+    # Local: import via package __init__
     sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-    from src.utils
+    from src.utils import (
         SearchService,
         CompetitorExtractor,
         CompetitorIntelAgent,

@@ -832,225 +832,18 @@ def export_html(data: dict, research: list, date_range: str = "All time") -> str
 
 
 # ============================================================
-#
+# Competitive Intel Page
 # ============================================================
 
-def main():
-    # Check access
-    if not check_access():
-        show_login_page()
-        return
-
-    # On HF Space, optionally hydrate runtime data from a private dataset repo.
-    sync_status = sync_private_data_if_configured()
-    if sync_status.get("status") == "error":
-        st.error(f"Private data sync failed: {sync_status.get('reason', 'unknown error')}")
-        return
-
-    # --- Page Navigation ---
-    page = st.sidebar.radio(
-        "Navigation",
-        ["Competitive Intel", "Model Evaluation"],
-        index=0,
-        key="nav_page",
-    )
-
-    if page == "Model Evaluation":
-        if IS_HF_SPACE:
-            from eval_tab import render_eval_tab
-        else:
-            from src.dashboard.eval_tab import render_eval_tab
-        render_eval_tab()
-        return
 
-
-
-
-    # --- Sidebar ---
-    with st.sidebar:
-        # === RESEARCH ===
-        st.header("Research")
-
-        queries_text = st.text_area(
-            "Queries",
-            value=DEFAULT_QUERIES,
-            height=150,
-            help="Enter search queries, one per line."
-        )
-        queries = [q.strip() for q in queries_text.strip().split("\n") if q.strip()]
-
-        # AI analysis option (only if LLM enabled)
-        analyze_with_ai = False
-        if LLM_ENABLED:
-            analyze_with_ai = st.checkbox("Analyze with AI", value=True, help="Use LLM to extract strategic insights from results")
-
-        if st.button(f"Run {len(queries)} searches", width="stretch", type="primary"):
-            progress = st.progress(0, text="Starting...")
-            success, total, failed, insights = run_expand_research(
-                queries,
-                progress_callback=lambda p, t: progress.progress(p, text=t),
-                analyze_with_ai=analyze_with_ai
-            )
-            progress.empty()
-
-            if success > 0:
-                msg = f"{success}/{total} searches done"
-                if insights:
-                    msg += f" + {len(insights)} AI insights"
-                    # Store insights in session state for display
-                    st.session_state.last_insights = insights
-                st.success(msg)
-                st.cache_data.clear()
-                st.rerun()
-
-        # Show last AI insights if any
-        if st.session_state.get("last_insights"):
-            with st.expander("AI Insights", expanded=True):
-                for insight in st.session_state.last_insights:
-                    st.caption(f"• {insight}")
-                if st.button("Clear", key="clear_insights"):
-                    del st.session_state.last_insights
-                    st.rerun()
-
-        st.divider()
-
-        # === DEEP INTEL ===
-        st.header("Deep Intel")
-        intel_company = st.selectbox("Competitor", COMPETITORS, index=0)
-        intel_categories = st.multiselect(
-            "Categories",
-            options=list(DEEP_INTEL_CATEGORIES.keys()),
-            default=list(DEEP_INTEL_CATEGORIES.keys()),
-            format_func=lambda k: DEEP_INTEL_CATEGORIES[k]["label"],
-        )
-        btn_col1, btn_col2 = st.columns([3, 1])
-        run_clicked = btn_col1.button("Run Deep Intel", width="stretch")
-        stop_clicked = btn_col2.button("Stop", key="stop_intel", width="stretch")
-
-        if stop_clicked:
-            st.session_state["intel_stop"] = True
-
-        if run_clicked:
-            st.session_state["intel_stop"] = False
-            agent = CompetitorIntelAgent(intel_company)
-            total_queries = sum(
-                len(DEEP_INTEL_CATEGORIES[c]["queries"])
-                for c in intel_categories if c in DEEP_INTEL_CATEGORIES
-            )
-            progress = st.progress(0, text=f"Starting {intel_company}...")
-            completed = [0]
-
-            original_search = agent.search.search
-            def _tracked_search(query, max_results=10, save=True):
-                if st.session_state.get("intel_stop"):
-                    return []
-                completed[0] += 1
-                progress.progress(
-                    min(completed[0] / max(total_queries, 1), 0.95),
-                    text=f"[{completed[0]}/{total_queries}] {query[:40]}...",
-                )
-                return original_search(query, max_results=max_results, save=save)
-            agent.search.search = _tracked_search
-
-            report_path = agent.run(
-                categories=intel_categories or None,
-                delay=1.0,
-            )
-            progress.progress(1.0, text="Done!")
-            progress.empty()
-
-            stopped = st.session_state.get("intel_stop", False)
-            findings = sum(len(s.findings) for s in agent.sections.values())
-            gaps = sum(len(s.gaps) for s in agent.sections.values())
-            if stopped:
-                st.warning(f"Stopped early — {intel_company}: {findings} findings, {gaps} gaps (partial)")
-            else:
-                st.success(f"{intel_company}: {findings} findings, {gaps} gaps")
-            st.session_state["intel_stop"] = False
-            st.cache_data.clear()
-            st.rerun()
-
-        st.divider()
-
-        # === DATA ===
-        st.header("Data")
-
-        date_range = st.selectbox(
-            "Time range",
-            ["All time", "Last 7 days", "Last 30 days", "Last 90 days"],
-            index=0,
-            label_visibility="collapsed"
-        )
-
-        col1, col2 = st.columns(2)
-        if col1.button("Refresh", width="stretch", help="Re-extract from research files"):
-            with st.spinner("..."):
-                run_extract()
-            st.cache_data.clear()
-            st.rerun()
-
-        if data:
-            report = export_html(data, research, date_range)
-            col2.download_button(
-                "Export",
-                report,
-                file_name=f"report-{datetime.now().strftime('%Y%m%d')}.html",
-                mime="text/html",
-                width="stretch"
-            )
-
-        st.divider()
-
-        # === STATUS ===
-        st.caption(f"{len(research)} files · Updated {data.get('_generated', 'N/A')[:10] if data else 'never'}")
-
-        if ACCESS_KEY and st.session_state.get("authenticated"):
-            if st.button("Logout", width="stretch"):
-                st.session_state.authenticated = False
-                st.session_state.admin_authenticated = False
-                st.query_params.pop("auth", None)
-                st.query_params.pop("adm", None)
-                st.rerun()
-
-        # === ADMIN: Access Log ===
-        if ADMIN_KEY:
-            # Auto-authenticate from URL token
-            if not st.session_state.get("admin_authenticated"):
-                if st.query_params.get("adm") == _auth_token(ADMIN_KEY, salt="gurma_adm"):
-                    st.session_state.admin_authenticated = True
-
-            st.divider()
-            if st.session_state.get("admin_authenticated"):
-                access_log = load_access_log()
-                st.caption(f"Access log ({len(access_log)} entries)")
-                if access_log:
-                    for entry in reversed(access_log[-20:]):
-                        st.caption(f"{entry.get('timestamp', '?')} · {entry.get('ip', '?')}")
-                else:
-                    st.caption("No accesses recorded yet")
-            else:
-                with st.popover("Admin"):
-                    admin_input = st.text_input("Admin key", type="password", key="admin_key_input")
-                    if st.button("Unlock", key="admin_unlock"):
-                        if admin_input == ADMIN_KEY:
-                            st.session_state.admin_authenticated = True
-                            st.query_params["adm"] = _auth_token(ADMIN_KEY, salt="gurma_adm")
-                            st.rerun()
-                        else:
-                            st.error("Invalid")
-
-    # --- Log access ---
-    log_access()
-
-    # --- Main Content ---
-    st.title("Rehabilitation Robotics — Competitive Landscape")
-
+def _render_intel_page(data, research, date_range):
+    """Competitive intel main content — rendered inside its tab."""
     if not data:
         st.warning("No competitor data found.")
         st.markdown("**First time?** Run the research pipeline to get started:")
-
+
         col_init1, col_init2 = st.columns(2)
-
+
         if col_init1.button("Quick Start (10 searches)", type="primary", width="stretch"):
             with st.spinner("Running core competitor searches..."):
                 core_queries = [

@@ -1071,15 +864,15 @@ def main():
                     progress.progress((i + 1) / (len(core_queries) + 1), f"Searching: {q[:30]}...")
                     if run_search(q):
                         success += 1
-
+
                 progress.progress(1.0, "Extracting data...")
                 run_extract()
                 progress.empty()
-
+
                 st.success(f"Done! {success}/{len(core_queries)} searches completed.")
                 st.cache_data.clear()
                 st.rerun()
-
+
         if col_init2.button("Full Research (47 searches)", width="stretch"):
             with st.spinner("Running full competitor research..."):
                 queries = []

@@ -1087,45 +880,45 @@ def main():
                    for template in BATCH_QUERY_TEMPLATES:
                        queries.append(template.format(company=company))
                 queries.extend(MARKET_QUERIES)
-
+
                 progress = st.progress(0, "Starting...")
                 success = 0
                 for i, q in enumerate(queries):
                     progress.progress((i + 1) / (len(queries) + 1), f"[{i+1}/{len(queries)}] {q[:30]}...")
                     if run_search(q):
                         success += 1
-
+
                 progress.progress(1.0, "Extracting data...")
                 run_extract()
                 progress.empty()
-
+
                 st.success(f"Done! {success}/{len(queries)} searches completed.")
                 st.cache_data.clear()
                 st.rerun()
-
+
         return
-
+
     competitors = data.get("competitors", [])
     market = data.get("market", {})
-
+
     # ===== MARKET & OPPORTUNITY =====
     col_market, col_opp = st.columns([1, 1])
-
+
     with col_market:
         st.markdown("### Market")
-
+
        size_2024 = market.get('size_2024', 2e9)
        size_2029 = market.get('size_2029_ai', 9.1e9)
        cagr = market.get('cagr', 0.278)
-
+
        m1, m2, m3 = st.columns(3)
        m1.metric("2024 Market", f"${size_2024/1e9:.1f}B")
        m2.metric("2029 AI Segment", f"${size_2029/1e9:.1f}B")
        m3.metric("CAGR", f"{cagr*100:.1f}%")
-
+
        growth_pct = min((size_2029 / size_2024 - 1) * 100, 400)
        st.progress(growth_pct / 400, text=f"{growth_pct:.0f}% projected growth (2024→2029)")
-
+
     with col_opp:
         opportunity = data.get("opportunity", {})
         headline = opportunity.get("headline", "Market opportunity detected")

@@ -1133,18 +926,16 @@ def main():
         confirmed = opportunity.get("confirmed", False)
         update_available = opportunity.get("update_available", False)
         detected_at = opportunity.get("detected_at", "")
-
+
         if confirmed:
             badge = f"<span style='color: #2ecc71;'>● Confirmed {opportunity.get('confirmed_at', detected_at)}</span>"
         elif update_available:
             badge = "<span style='color: #e67e22;'>● Update available</span>"
         else:
             badge = f"<span style='color: #3498db;'>● Auto-detected {detected_at}</span>"
-
-        # Source indicators
+
         sources = opportunity.get("sources", [])
         if not sources:
-            # Backward compat with old intel_sourced/llm_synthesized booleans
             if opportunity.get("intel_sourced"):
                 sources.append("intel")
             if opportunity.get("llm_synthesized"):

@@ -1155,9 +946,9 @@ def main():
             for s in sources if s in badge_labels
         ]
         source_html = " ".join(source_tags)
-
+
         points_html = "".join(f"<li>{p}</li>" for p in points[:4])
-
+
         st.markdown(f"""
         <div style="
             background: linear-gradient(135deg, #1a472a 0%, #2d5a3c 100%);

@@ -1174,7 +965,7 @@ def main():
         </ul>
         </div>
         """, unsafe_allow_html=True)
-
+
         opp_col1, opp_col2 = st.columns(2)
         if not confirmed:
             if opp_col1.button("Confirm", key="confirm_opp", width="stretch"):

@@ -1186,19 +977,19 @@ def main():
                 apply_opportunity_update()
                 st.cache_data.clear()
                 st.rerun()
-
+
     st.divider()
-
+
     # ===== COMPETITOR CARDS =====
     st.header("Competitors")
-
+
     sorted_competitors = sorted(competitors, key=lambda x: x.get("mentions", 0), reverse=True)
-
+
     legend_items = [f"<span style='color: {v['color']};'>●</span> {v['label']}" for k, v in STATUS_CONFIG.items() if k != "unknown"]
     st.markdown(" | ".join(legend_items), unsafe_allow_html=True)
-
+
     col1, col2 = st.columns(2)
-
+
     for i, comp in enumerate(sorted_competitors):
         with col1 if i % 2 == 0 else col2:
             status = comp.get("status", "unknown")

@@ -1206,7 +997,7 @@ def main():
             color = status_color(status)
             label = status_label(status)
             mentions = comp.get("mentions", 0)
-
+
             st.markdown(f"""
             <div style="
                 border: 1px solid {color}40;

@@ -1232,37 +1023,37 @@ def main():
             </div>
             </div>
             """, unsafe_allow_html=True)
-
+
             with st.expander("Details", expanded=False):
                 m1, m2 = st.columns(2)
                 if comp.get("stock"):
                     m1.metric("Stock", f"${comp['stock']:.2f}")
                 if comp.get("funding"):
                     m2.metric("Funding", f"${comp['funding']/1e6:.0f}M")
-
+
                 if comp.get("notes"):
                     st.caption(comp["notes"][:200] + "..." if len(comp.get("notes", "")) > 200 else comp.get("notes", ""))
-
+
                 events = comp.get("events", [])[:3]
                 if events:
                     st.markdown("**Recent:**")
                     for e in events:
                         st.caption(f"• {e.get('date', 'N/A')}: {e.get('event', '')[:80]}...")
-
+
                 urls = comp.get("sample_urls", [])[:2]
                 if urls:
                     for url in urls:
                         st.markdown(f"[Source →]({url})")
-
+
     # --- Timeline ---
     st.header("Timeline")
-
+
     timeline_fig = build_timeline_figure(sorted_competitors, date_range)
     if timeline_fig:
         st.plotly_chart(timeline_fig, width="stretch")
     else:
         st.info("No events in selected time range")
-
+
     # --- Deep Intel ---
     intel_reports = load_intel_reports()
     if intel_reports:

@@ -1320,7 +1111,7 @@ def main():
     with st.expander("Recent News", expanded=False):
         news_by_company = {c["name"]: [] for c in competitors}
         news_by_company["Other"] = []
-
+
         for r in research:
             timestamp = r.get("timestamp", "")
             for result in r.get("results", []):

@@ -1330,7 +1121,7 @@ def main():
                     "url": result.get("url", ""),
                     "date": timestamp[:10] if timestamp else ""
                 }
-
+
                 text = (item["title"] + " " + item["snippet"]).lower()
                 found = False
                 for comp in competitors:

@@ -1341,13 +1132,13 @@ def main():
                         break
                 if not found:
                     news_by_company["Other"].append(item)
-
+
         company_options = ["All"] + [c["name"] for c in competitors if news_by_company.get(c["name"])]
         company_filter = st.selectbox("Filter by company", company_options, index=0)
-
+
         displayed = 0
         max_display = 12
-
+
         if company_filter == "All":
             active_companies = [c for c in competitors if news_by_company.get(c["name"])]
             per_company = max(2, max_display // len(active_companies)) if active_companies else 0

@@ -1363,10 +1154,230 @@ def main():
             for item in items[:max_display]:
                 _render_news_item(item, company_filter)
                 displayed += 1
-
+
         if displayed == 0:
             st.info("No news found. Run some searches!")
 
 
+# ============================================================
+# Main Application
+# ============================================================
+
+def main():
+    # Check access
+    if not check_access():
+        show_login_page()
+        return
+
+    # On HF Space, optionally hydrate runtime data from a private dataset repo.
+    sync_status = sync_private_data_if_configured()
+    if sync_status.get("status") == "error":
+        st.error(f"Private data sync failed: {sync_status.get('reason', 'unknown error')}")
+        return
+
+    data = load_data()
+    research = load_research_files()
+
+    # --- Sidebar (Competitive Intel controls) ---
+    with st.sidebar:
+        # === RESEARCH ===
+        st.header("Research")
+
+        queries_text = st.text_area(
+            "Queries",
+            value=DEFAULT_QUERIES,
+            height=150,
+            help="Enter search queries, one per line."
+        )
+        queries = [q.strip() for q in queries_text.strip().split("\n") if q.strip()]
+
+        # AI analysis option (only if LLM enabled)
+        analyze_with_ai = False
+        if LLM_ENABLED:
+            analyze_with_ai = st.checkbox("Analyze with AI", value=True, help="Use LLM to extract strategic insights from results")
+
+        if st.button(f"Run {len(queries)} searches", width="stretch", type="primary"):
+            progress = st.progress(0, text="Starting...")
+            success, total, failed, insights = run_expand_research(
+                queries,
+                progress_callback=lambda p, t: progress.progress(p, text=t),
+                analyze_with_ai=analyze_with_ai
+            )
+            progress.empty()
+
+            if success > 0:
+                msg = f"{success}/{total} searches done"
+                if insights:
+                    msg += f" + {len(insights)} AI insights"
+                    # Store insights in session state for display
+                    st.session_state.last_insights = insights
+                st.success(msg)
+                st.cache_data.clear()
+                st.rerun()
+
+        # Show last AI insights if any
+        if st.session_state.get("last_insights"):
+            with st.expander("AI Insights", expanded=True):
+                for insight in st.session_state.last_insights:
+                    st.caption(f"• {insight}")
+                if st.button("Clear", key="clear_insights"):
+                    del st.session_state.last_insights
+                    st.rerun()
+
+        st.divider()
+
+        # === DEEP INTEL ===
+        st.header("Deep Intel")
+        intel_company = st.selectbox("Competitor", COMPETITORS, index=0)
+        intel_categories = st.multiselect(
+            "Categories",
+            options=list(DEEP_INTEL_CATEGORIES.keys()),
+            default=list(DEEP_INTEL_CATEGORIES.keys()),
+            format_func=lambda k: DEEP_INTEL_CATEGORIES[k]["label"],
+        )
+        btn_col1, btn_col2 = st.columns([3, 1])
+        run_clicked = btn_col1.button("Run Deep Intel", width="stretch")
+        stop_clicked = btn_col2.button("Stop", key="stop_intel", width="stretch")
+
+        if stop_clicked:
+            st.session_state["intel_stop"] = True
+
+        if run_clicked:
+            st.session_state["intel_stop"] = False
+            agent = CompetitorIntelAgent(intel_company)
+            total_queries = sum(
+                len(DEEP_INTEL_CATEGORIES[c]["queries"])
+                for c in intel_categories if c in DEEP_INTEL_CATEGORIES
+            )
+            progress = st.progress(0, text=f"Starting {intel_company}...")
+            completed = [0]
+
+            original_search = agent.search.search
+            def _tracked_search(query, max_results=10, save=True):
+                if st.session_state.get("intel_stop"):
+                    return []
+                completed[0] += 1
+                progress.progress(
+                    min(completed[0] / max(total_queries, 1), 0.95),
+                    text=f"[{completed[0]}/{total_queries}] {query[:40]}...",
+                )
+                return original_search(query, max_results=max_results, save=save)
+            agent.search.search = _tracked_search
+
+            report_path = agent.run(
+                categories=intel_categories or None,
+                delay=1.0,
+            )
+            progress.progress(1.0, text="Done!")
+            progress.empty()
+
+            stopped = st.session_state.get("intel_stop", False)
+            findings = sum(len(s.findings) for s in agent.sections.values())
+            gaps = sum(len(s.gaps) for s in agent.sections.values())
+            if stopped:
+                st.warning(f"Stopped early — {intel_company}: {findings} findings, {gaps} gaps (partial)")
+            else:
+                st.success(f"{intel_company}: {findings} findings, {gaps} gaps")
+            st.session_state["intel_stop"] = False
+            st.cache_data.clear()
+            st.rerun()
+
+        st.divider()
+
+        # === DATA ===
+        st.header("Data")
+
+        date_range = st.selectbox(
+            "Time range",
+            ["All time", "Last 7 days", "Last 30 days", "Last 90 days"],
+            index=0,
+            label_visibility="collapsed"
+        )
+
+        col1, col2 = st.columns(2)
+        if col1.button("Refresh", width="stretch", help="Re-extract from research files"):
+            with st.spinner("..."):
+                run_extract()
+            st.cache_data.clear()
+            st.rerun()
+
+        if data:
+            report = export_html(data, research, date_range)
+            col2.download_button(
+                "Export",
+                report,
+                file_name=f"report-{datetime.now().strftime('%Y%m%d')}.html",
+                mime="text/html",
+                width="stretch"
+            )
+
+        st.divider()
+
+        # === STATUS ===
+        st.caption(f"{len(research)} files · Updated {data.get('_generated', 'N/A')[:10] if data else 'never'}")
+
+        if ACCESS_KEY and st.session_state.get("authenticated"):
+            if st.button("Logout", width="stretch"):
+                st.session_state.authenticated = False
+                st.session_state.admin_authenticated = False
+                st.query_params.pop("auth", None)
+                st.query_params.pop("adm", None)
+                st.rerun()
+
+        # === ADMIN: Access Log ===
+        if ADMIN_KEY:
+            # Auto-authenticate from URL token
+            if not st.session_state.get("admin_authenticated"):
+                if st.query_params.get("adm") == _auth_token(ADMIN_KEY, salt="gurma_adm"):
+                    st.session_state.admin_authenticated = True
+
+            st.divider()
+            if st.session_state.get("admin_authenticated"):
+                access_log = load_access_log()
+                st.caption(f"Access log ({len(access_log)} entries)")
+                if access_log:
+                    for entry in reversed(access_log[-20:]):
+                        st.caption(f"{entry.get('timestamp', '?')} · {entry.get('ip', '?')}")
+                else:
+                    st.caption("No accesses recorded yet")
+            else:
+                with st.popover("Admin"):
+                    admin_input = st.text_input("Admin key", type="password", key="admin_key_input")
+                    if st.button("Unlock", key="admin_unlock"):
+                        if admin_input == ADMIN_KEY:
+                            st.session_state.admin_authenticated = True
+                            st.query_params["adm"] = _auth_token(ADMIN_KEY, salt="gurma_adm")
+                            st.rerun()
+                        else:
+                            st.error("Invalid")
+
+    # --- Log access ---
+    log_access()
+
+    # --- Main Content (Tabs) ---
+    tab_intel, tab_eval, tab_tr = st.tabs([
+        "Competitive Intel",
+        "Model Evaluation",
+        "Turkey Expansion",
+    ])
+
+    with tab_intel:
+        _render_intel_page(data, research, date_range)
+
+    with tab_eval:
+        if IS_HF_SPACE:
+            from eval_tab import render_eval_tab
+        else:
+            from src.dashboard.eval_tab import render_eval_tab
+        render_eval_tab()
+
+    with tab_tr:
+        if IS_HF_SPACE:
+            from tr_tab import render_tr_tab
+        else:
+            from src.dashboard.tr_tab import render_tr_tab
+        render_tr_tab()
+
+
 if __name__ == "__main__":
     main()
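The Deep Intel runner in the sidebar above reports progress by temporarily swapping agent.search.search for a counting wrapper. A minimal standalone sketch of that wrap-and-delegate pattern follows; Agent and on_progress here are hypothetical stand-ins for illustration, not names from this commit:

# Minimal sketch of the wrap-and-delegate progress pattern used in app.py.
# `Agent` and `on_progress` are hypothetical stand-ins, not from this commit.
class Agent:
    def search(self, query, max_results=10):
        return [f"result for {query}"]

def with_progress(agent, total, on_progress):
    original = agent.search           # keep a reference to the real bound method
    completed = [0]                   # one-element list: a mutable counter for the closure

    def tracked(query, max_results=10):
        completed[0] += 1
        on_progress(completed[0] / max(total, 1), query)
        return original(query, max_results=max_results)

    agent.search = tracked            # monkey-patch: callers get progress for free
    return agent

agent = with_progress(Agent(), total=3, on_progress=lambda p, q: print(f"{p:.0%} {q}"))
for q in ["funding", "FDA approval", "partnerships"]:
    agent.search(q)

Keeping a reference to the original bound method lets the wrapper delegate without subclassing, and the one-element list gives the closure a mutable counter without nonlocal.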
cli.py ADDED

@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+GURMA.ai Research Tool — CLI entry point.
+
+Usage:
+    python research.py search "rehabilitation robotics market"
+    python research.py batch
+    python research.py competitor "Ekso Bionics"
+    python research.py competitor --list-categories
+    python research.py extract
+    python research.py list
+    python research.py sota
+    python research.py sota --analyze notes/research/podcast.md
+    python research.py mali
+    python research.py fonlar -c tubitak
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+try:
+    from .config import RESEARCH_DIR, COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES, LLM_ENABLED
+    from .search import SearchService, ResultStorage
+    from .extract import CompetitorExtractor
+    from .intel import CompetitorIntelAgent, DEEP_INTEL_CATEGORIES
+except ImportError:
+    from config import RESEARCH_DIR, COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES, LLM_ENABLED
+    from search import SearchService, ResultStorage
+    from extract import CompetitorExtractor
+    from intel import CompetitorIntelAgent, DEEP_INTEL_CATEGORIES
+
+
+# ============================================================
+# Commands
+# ============================================================
+
+def cmd_search(args):
+    service = SearchService(backend=args.backend)
+    print(f"Searching: {args.query}")
+    print(f"Backend: {args.backend} | Max: {args.max_results}")
+    print("-" * 50)
+
+    results = service.search(args.query, args.max_results, save=args.save)
+
+    for i, r in enumerate(results, 1):
+        print(f"\n{i}. {r.title}")
+        print(f"   {r.url}")
+        print(f"   {r.snippet[:150]}...")
+
+    print(f"\n[{len(results)} results]")
+    if args.save:
+        print(f"Saved to: {RESEARCH_DIR}")
+
+
+def cmd_batch(args):
+    service = SearchService(backend=args.backend)
+    storage = ResultStorage()
+
+    queries = []
+    for company in COMPETITORS:
+        for template in BATCH_QUERY_TEMPLATES:
+            queries.append(template.format(company=company))
+    queries.extend(MARKET_QUERIES)
+
+    total_queries = len(queries)
+
+    skipped = 0
+    if not args.force:
+        recent = storage.get_recent_queries(days=args.days)
+        original_count = len(queries)
+        queries = [q for q in queries if q.lower().strip() not in recent]
+        skipped = original_count - len(queries)
+
+    print(f"Batch Research")
+    print(f"{'='*60}")
+    print(f"Competitors: {len(COMPETITORS)}")
+    print(f"Total queries: {total_queries}")
+    if skipped > 0:
+        print(f"Skipped (run in last {args.days} days): {skipped}")
+    print(f"New queries to run: {len(queries)}")
+    print(f"Output: {RESEARCH_DIR}")
+    print(f"{'='*60}")
+
+    if not queries:
+        print("\nNo new queries to run. Use --force to re-run all.")
+        return
+
+    def progress(i, total, query):
+        print(f"\n[{i}/{total}] {query}")
+
+    stats = service.search_batch(queries, args.max_results, args.delay, callback=progress)
+
+    success = sum(1 for v in stats.values() if v >= 0)
+    print(f"\n{'='*60}")
+    print(f"Complete: {success}/{len(queries)} successful")
+    if skipped > 0:
+        print(f"Skipped: {skipped} (already run recently)")
+    print(f"{'='*60}")
+
+
+def cmd_competitor(args):
+    company = args.company
+    use_external_llm = args.external_llm
+
+    if use_external_llm and not LLM_ENABLED:
+        print("Warning: --external-llm requested but OPENROUTER_API_KEY not found. Skipping external LLM.")
+        use_external_llm = False
+
+    categories = None
+    if args.categories:
+        categories = [c.strip() for c in args.categories.split(",")]
+        valid = set(DEEP_INTEL_CATEGORIES.keys())
+        invalid = [c for c in categories if c not in valid]
+        if invalid:
+            print(f"Invalid categories: {invalid}")
+            print(f"Valid: {sorted(valid)}")
+            return
+
+    if args.list_categories:
+        print("Available categories:")
+        for key, cat in DEEP_INTEL_CATEGORIES.items():
+            q_count = len(cat["queries"])
+            print(f"  {key:30s} {cat['label']:30s} ({q_count} queries)")
+        return
+
+    agent = CompetitorIntelAgent(company)
+    report_path = agent.run(
+        categories=categories,
+        use_external_llm=use_external_llm,
+        delay=args.delay,
+        max_results=args.max_results,
+    )
+
+    print(f"\nReport: {report_path}")
+
+
+def cmd_extract(args):
+    extractor = CompetitorExtractor()
+
+    print(f"Loading research from: {extractor.research_dir}")
+    data = extractor.process()
+
+    if not data["competitors"]:
+        print("No research files found. Run 'batch' first.")
+        return
+
+    output = extractor.save(data)
+
+    print(f"Saved to: {output}")
+    print(f"\nCompany mentions:")
+    for comp in data["competitors"]:
+        status_marker = {"collapsed": "⚠", "weak": "↓", "growing": "↑", "strong": "★"}.get(comp["status"], "•")
+        print(f"  {status_marker} {comp['name']}: {comp['mentions']} mentions ({comp['status']})")
+
+
+def cmd_sota(args):
+    try:
+        from .sota_agent import SOTAScoutAgent
+    except ImportError:
+        from sota_agent import SOTAScoutAgent
+
+    agent = SOTAScoutAgent()
+
+    if args.analyze:
+        report = agent.analyze(args.analyze)
+        print(f"\nAnalysis report: {report}")
+        return
+
+    agent.show(section=args.show)
+
+
+def cmd_mali(args):
+    try:
+        from .tr_agents import MaliMusavirAgent
+    except ImportError:
+        from tr_agents import MaliMusavirAgent
+
+    agent = MaliMusavirAgent()
+
+    if args.list_categories:
+        agent.list_categories()
+        return
+
+    categories = None
+    if args.categories:
+        categories = [c.strip() for c in args.categories.split(",")]
+        valid = set(agent.CATEGORIES.keys())
+        invalid = [c for c in categories if c not in valid]
+        if invalid:
+            print(f"Geçersiz kategoriler: {invalid}")
+            print(f"Geçerli: {sorted(valid)}")
+            return
+
+    report_path = agent.run(
+        categories=categories,
+        delay=args.delay,
+        max_results=args.max_results,
+    )
+    print(f"\nRapor: {report_path}")
+
+
+def cmd_fonlar(args):
+    try:
+        from .tr_agents import FonArastirmaAgent
+    except ImportError:
+        from tr_agents import FonArastirmaAgent
+
+    agent = FonArastirmaAgent()
+
+    if args.list_categories:
+        agent.list_categories()
+        return
+
+    categories = None
+    if args.categories:
+        categories = [c.strip() for c in args.categories.split(",")]
+        valid = set(agent.CATEGORIES.keys())
+        invalid = [c for c in categories if c not in valid]
+        if invalid:
+            print(f"Geçersiz kategoriler: {invalid}")
+            print(f"Geçerli: {sorted(valid)}")
+            return
+
+    report_path = agent.run(
+        categories=categories,
+        delay=args.delay,
+        max_results=args.max_results,
+    )
+    print(f"\nRapor: {report_path}")
+
+
+def cmd_list(args):
+    storage = ResultStorage()
+    searches = storage.list_searches(args.limit)
+
+    if not searches:
+        print(f"No searches in {RESEARCH_DIR}")
+        return
+
+    print(f"Recent searches ({RESEARCH_DIR}):\n")
+    for s in searches:
+        print(f"  {s['timestamp'][:10]}  {s['results']:2d} results  {s['query'][:50]}")
+
+
+# ============================================================
+# Argparse
+# ============================================================
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="GURMA.ai Research Tool",
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    subparsers = parser.add_subparsers(dest="command", help="Commands")
+
+    # search
+    p_search = subparsers.add_parser("search", help="Single web search")
+    p_search.add_argument("query", help="Search query")
+    p_search.add_argument("-b", "--backend", default="duckduckgo",
+                          choices=["duckduckgo", "ddg", "serpapi", "brave"])
+    p_search.add_argument("-n", "--max-results", type=int, default=10)
+    p_search.add_argument("--no-save", dest="save", action="store_false")
+    p_search.set_defaults(func=cmd_search)
+
+    # batch
+    p_batch = subparsers.add_parser("batch", help="Batch research all competitors")
+    p_batch.add_argument("-b", "--backend", default="duckduckgo")
+    p_batch.add_argument("-n", "--max-results", type=int, default=10)
+    p_batch.add_argument("-d", "--delay", type=float, default=0.5)
+    p_batch.add_argument("--days", type=int, default=7,
+                         help="Skip queries run within N days (default: 7)")
+    p_batch.add_argument("-f", "--force", action="store_true",
+                         help="Force re-run all queries (ignore deduplication)")
+    p_batch.set_defaults(func=cmd_batch)
+
+    # competitor (deep intel)
+    p_comp = subparsers.add_parser("competitor", help="Deep competitive intelligence on a company")
+    p_comp.add_argument("company", nargs="?", default="", help="Company name (e.g. 'Ekso Bionics')")
+    p_comp.add_argument("--external-llm", action="store_true",
+                        help="Also use external LLM (OpenRouter) for enhanced analysis")
+    p_comp.add_argument("-c", "--categories", type=str, default=None,
+                        help="Comma-separated categories (default: all)")
+    p_comp.add_argument("--list-categories", action="store_true",
+                        help="List available categories")
+    p_comp.add_argument("-n", "--max-results", type=int, default=10)
+    p_comp.add_argument("-d", "--delay", type=float, default=1.0,
+                        help="Delay between searches in seconds (default: 1.0)")
+    p_comp.set_defaults(func=cmd_competitor)
+
+    # sota
+    p_sota = subparsers.add_parser("sota", help="SOTA technology knowledge base for GURMA.ai")
+    p_sota.add_argument("--analyze", "-a", type=str, default=None,
+                        help="Analyze a document and update knowledge base")
+    p_sota.add_argument("--show", "-s", type=str, default=None, nargs="?",
+                        const=None,
+                        choices=["models", "techniques", "stack", "principles", "actions", "sources"],
+                        help="Show specific KB section (default: summary)")
+    p_sota.set_defaults(func=cmd_sota)
+
+    # mali (Turkish company formation)
+    p_mali = subparsers.add_parser("mali", help="Türkiye şirket kuruluşu araştırması")
+    p_mali.add_argument("-c", "--categories", type=str, default=None,
+                        help="Virgülle ayrılmış kategoriler (varsayılan: tümü)")
+    p_mali.add_argument("--list-categories", action="store_true",
+                        help="Mevcut kategorileri listele")
+    p_mali.add_argument("-n", "--max-results", type=int, default=10)
+    p_mali.add_argument("-d", "--delay", type=float, default=1.0)
+    p_mali.set_defaults(func=cmd_mali)
+
+    # fonlar (Turkish government funding research)
+    p_fonlar = subparsers.add_parser("fonlar", help="TÜBİTAK ve devlet fonları araştırması")
+    p_fonlar.add_argument("-c", "--categories", type=str, default=None,
+                          help="Virgülle ayrılmış kategoriler (varsayılan: tümü)")
+    p_fonlar.add_argument("--list-categories", action="store_true",
+                          help="Mevcut kategorileri listele")
+    p_fonlar.add_argument("-n", "--max-results", type=int, default=10)
+    p_fonlar.add_argument("-d", "--delay", type=float, default=1.0)
+    p_fonlar.set_defaults(func=cmd_fonlar)
+
+    # extract
+    p_extract = subparsers.add_parser("extract", help="Extract competitor data to JSON")
+    p_extract.set_defaults(func=cmd_extract)
+
+    # list
+    p_list = subparsers.add_parser("list", help="List saved searches")
+    p_list.add_argument("-l", "--limit", type=int, default=20)
+    p_list.set_defaults(func=cmd_list)
+
+    args = parser.parse_args()
+
+    if hasattr(args, "func"):
+        args.func(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
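cmd_batch's deduplication depends on ResultStorage.get_recent_queries from search.py, which is not visible in this view. A plausible sketch of what it could return, assuming saved searches are JSON files carrying query and timestamp fields (both assumptions, not the actual search.py code); note it returns normalized queries, matching the q.lower().strip() membership test in cmd_batch:

# Hypothetical sketch of ResultStorage.get_recent_queries — the real
# implementation lives in search.py, which this commit view does not show.
from __future__ import annotations

import json
from datetime import datetime, timedelta
from pathlib import Path

def get_recent_queries(research_dir: Path, days: int = 7) -> set[str]:
    """Return normalized queries searched within the last `days` days."""
    cutoff = datetime.now() - timedelta(days=days)
    recent: set[str] = set()
    for f in research_dir.glob("*.json"):
        try:
            doc = json.loads(f.read_text())
            ts = datetime.fromisoformat(doc["timestamp"])
        except (ValueError, KeyError, json.JSONDecodeError):
            continue  # skip malformed files rather than failing the whole batch
        if ts >= cutoff:
            recent.add(doc["query"].lower().strip())
    return recent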
config.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
GURMA.ai shared configuration.

Environment detection, directory paths, API keys, and research constants
used across all agents and the dashboard.
"""

from __future__ import annotations

import os
from pathlib import Path


# ============================================================
# Environment Detection
# ============================================================

def _detect_project_root() -> Path:
    """Detect project root based on environment."""
    if os.getenv("HF_SPACE") or Path("/app/research.py").exists():
        return Path("/app")
    return Path(__file__).parent.parent.parent

PROJECT_ROOT = _detect_project_root()
IS_HF_SPACE = PROJECT_ROOT == Path("/app")

if not IS_HF_SPACE:
    try:
        from dotenv import load_dotenv
        load_dotenv(PROJECT_ROOT / ".env")
    except ImportError:
        pass


# ============================================================
# Directories
# ============================================================

if IS_HF_SPACE:
    RESEARCH_DIR = PROJECT_ROOT / "data"
    DATA_DIR = PROJECT_ROOT / "data"
else:
    RESEARCH_DIR = PROJECT_ROOT / "data"
    DATA_DIR = PROJECT_ROOT / "src" / "dashboard"

RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)


# ============================================================
# API Keys & LLM Config
# ============================================================

SERPAPI_KEY = os.getenv("SERPAPI_KEY")
BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

LLM_MODEL = "deepseek/deepseek-chat"
LLM_ENABLED = bool(OPENROUTER_API_KEY)


# ============================================================
# Research Constants
# ============================================================

COMPETITORS = [
    "Hocoma", "Ekso Bionics", "Lifeward ReWalk", "Fourier Intelligence",
    "Cyberdyne HAL", "Wandercraft", "Myomo", "Bionik",
]

BATCH_QUERY_TEMPLATES = [
    "{company} latest news 2025 2026",
    "{company} funding investors valuation",
    "{company} FDA approval regulatory",
    "{company} partnerships collaborations",
    "{company} AI machine learning technology",
    "site:accessdata.fda.gov {company}",
    "site:clinicaltrials.gov {company} rehabilitation",
    "site:crunchbase.com {company}",
    "site:sec.gov {company} 10-K OR 8-K",
    "site:patents.google.com {company} exoskeleton OR rehabilitation",
]

MARKET_QUERIES = [
    "rehabilitation robotics market size 2026 forecast",
    "exoskeleton market growth AI integration",
    "rehabilitation robotics insurance reimbursement",
    "medical exoskeleton FDA approval 2025",
    "stroke rehabilitation AI technology",
    "spinal cord injury exoskeleton treatment",
    "rehabilitation robotics competitive landscape",
    "site:exoskeletonreport.com 2025 2026",
    "site:medgadget.com exoskeleton rehabilitation",
    "site:fda.gov rehabilitation robotics guidance",
    "MDR medical device regulation exoskeleton CE mark 2025",
    "site:pubmed.ncbi.nlm.nih.gov rehabilitation robotics AI 2024 2025",
    "exoskeleton insurance coverage CMS reimbursement code",
    "rehabilitation robotics HCPCS code billing",
]
extract.py
ADDED
@@ -0,0 +1,537 @@
"""
Competitor data extraction and opportunity detection.

Builds competitors.json from raw research files + deep intel findings,
detects market opportunities, and optionally synthesizes via LLM.
"""

from __future__ import annotations

import json
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Optional

try:
    from .config import RESEARCH_DIR, DATA_DIR, LLM_ENABLED
    from .llm import LLMClient
except ImportError:
    from config import RESEARCH_DIR, DATA_DIR, LLM_ENABLED
    from llm import LLMClient


# ============================================================
# Company Definitions & Extraction Patterns
# ============================================================

COMPANY_DEFINITIONS = {
    "Hocoma": {"aliases": ["hocoma", "dih", "lokomat"], "country": "Switzerland", "product": "Lokomat", "status": "collapsed", "verified": True},
    "Ekso Bionics": {"aliases": ["ekso", "eksobionics", "eksonr"], "country": "USA", "product": "EksoNR", "status": "weak", "verified": True},
    "Cyberdyne": {"aliases": ["cyberdyne", "hal exoskeleton"], "country": "Japan", "product": "HAL", "status": "strong", "verified": True},
    "Lifeward": {"aliases": ["lifeward", "rewalk", "alterg"], "country": "Israel/USA", "product": "ReWalk 7", "status": "consolidating", "verified": True},
    "Fourier": {"aliases": ["fourier", "fourier intelligence"], "country": "China", "product": "X1, M2", "status": "growing", "verified": True},
    "Myomo": {"aliases": ["myomo", "myopro"], "country": "USA", "product": "MyoPro", "status": "stable", "verified": False},
    "Bionik": {"aliases": ["bionik", "inmotion"], "country": "Canada", "product": "InMotion", "status": "stable", "verified": False},
    "Wandercraft": {"aliases": ["wandercraft", "atalante"], "country": "France", "product": "Atalante X", "status": "growing", "verified": False},
}

STATUS_KEYWORDS = [
    ("collapsed", ["bankrupt", "delisted", "suspended", "collapse", "shut down", "ceased", "nasdaq delisted"]),
    ("weak", ["52-week low", "struggling", "losses", "declining", "layoffs"]),
    ("growing", ["series e", "series d", "series c", "funding round", "$109 million"]),
    ("consolidating", ["acquired", "merger", "acquisition"]),
    ("strong", ["leader", "dominant", "profitable"]),
]

DATE_PATTERN = re.compile(
    r'((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})'
    r'|(\d{4}-\d{2}-\d{2})'
    r'|(\d{4}-\d{2})'
    r'|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4})'
)
MONEY_PATTERN = re.compile(r'\$[\d,]+(?:\.\d+)?(?:\s*(?:million|billion|M|B))?|\d+(?:\.\d+)?\s*(?:million|billion)', re.IGNORECASE)


# ============================================================
# Competitor Extractor
# ============================================================

class CompetitorExtractor:
    """Extract structured competitor data from research results."""

    def __init__(self, research_dir: Path = RESEARCH_DIR,
                 output_file: Path = None):
        self.research_dir = research_dir
        self.output_file = output_file or (DATA_DIR / "competitors.json")

    def load_research_files(self) -> list[dict]:
        results = []
        if not self.research_dir.exists():
            return results

        for json_file in self.research_dir.glob("*.json"):
            if json_file.name.startswith("."):
                continue
            try:
                with open(json_file) as f:
                    data = json.load(f)
                data["_source_file"] = json_file.name
                results.append(data)
            except Exception as e:
                print(f"Error loading {json_file}: {e}")

        return results

    def find_mentions(self, text: str) -> list[str]:
        text_lower = text.lower()
        mentioned = []
        for company, info in COMPANY_DEFINITIONS.items():
            if any(alias in text_lower for alias in info["aliases"]):
                mentioned.append(company)
        return mentioned

    def normalize_date(self, date_str: str) -> str | None:
        formats = ["%B %d, %Y", "%B %d %Y", "%b %d, %Y", "%b %d %Y", "%Y-%m-%d", "%Y-%m"]
        for fmt in formats:
            try:
                dt = datetime.strptime(date_str.strip(), fmt)
                if dt.year < 2010:
                    return None
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                pass
        return date_str

    def extract_events(self, text: str, company: str) -> list[dict]:
        events = []
        aliases = COMPANY_DEFINITIONS[company]["aliases"]

        for match in DATE_PATTERN.finditer(text):
            date_str = match.group(0)
            if not date_str:
                continue

            start = max(0, match.start() - 50)
            end = min(len(text), match.end() + 150)
            context = text[start:end]

            normalized = self.normalize_date(date_str)
            if normalized and any(alias in context.lower() for alias in aliases):
                events.append({
                    "date": normalized,
                    "context": context.strip()
                })

        return events

    def detect_status(self, snippets: list[str], default: str) -> str:
        text = " ".join(snippets).lower()
        for status, keywords in STATUS_KEYWORDS:
            if any(kw.lower() in text for kw in keywords):
                return status
        return default

    def extract_stock(self, snippets: list[str]) -> Optional[float]:
        for snippet in snippets:
            match = re.search(r'\$(\d+\.?\d*)', snippet)
            if match and float(match.group(1)) < 1000:
                return float(match.group(1))
        return None

    def extract_funding(self, money_mentions: list[str]) -> Optional[int]:
        for m in money_mentions:
            match = re.search(r'(\d+)\s*(?:million|M)', m, re.IGNORECASE)
            if match:
                return int(match.group(1)) * 1_000_000
            match = re.search(r'(\d+\.?\d*)\s*(?:billion|B)', m, re.IGNORECASE)
            if match:
                return int(float(match.group(1)) * 1_000_000_000)
        return None
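    # Editor's illustration of the parsers above (expected values worked out by
    # hand from the regexes and strptime formats; not part of the commit):
    #   normalize_date("March 5, 2025")           -> "2025-03-05"
    #   extract_funding(["raised $109 million"])  -> 109_000_000
    #   extract_funding(["a $1.2 billion round"]) -> 1_200_000_000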
+
|
| 153 |
+
def _load_intel_findings(self) -> dict[str, list[dict]]:
|
| 154 |
+
"""Load confirmed findings from Deep Intel reports, grouped by company."""
|
| 155 |
+
intel_dir = self.research_dir / "intel"
|
| 156 |
+
if not intel_dir.exists():
|
| 157 |
+
return {}
|
| 158 |
+
|
| 159 |
+
findings_by_company: dict[str, list[dict]] = {}
|
| 160 |
+
seen_companies: set[str] = set()
|
| 161 |
+
|
| 162 |
+
for json_file in sorted(intel_dir.glob("*_intel.json"), reverse=True):
|
| 163 |
+
try:
|
| 164 |
+
with open(json_file) as f:
|
| 165 |
+
data = json.load(f)
|
| 166 |
+
company = data.get("company", "")
|
| 167 |
+
if not company or company in seen_companies:
|
| 168 |
+
continue
|
| 169 |
+
seen_companies.add(company)
|
| 170 |
+
|
| 171 |
+
all_findings = []
|
| 172 |
+
for section in data.get("sections", {}).values():
|
| 173 |
+
for finding in section.get("findings", []):
|
| 174 |
+
if isinstance(finding, dict) and finding.get("text"):
|
| 175 |
+
all_findings.append(finding)
|
| 176 |
+
elif isinstance(finding, str) and finding:
|
| 177 |
+
all_findings.append({"text": finding, "confirmed": False, "source": ""})
|
| 178 |
+
|
| 179 |
+
if all_findings:
|
| 180 |
+
findings_by_company[company] = all_findings
|
| 181 |
+
except Exception:
|
| 182 |
+
pass
|
| 183 |
+
|
| 184 |
+
return findings_by_company
|
| 185 |
+
|
| 186 |
+
def _extract_intel_opportunities(self, intel_findings: dict[str, list[dict]]) -> list[dict]:
|
| 187 |
+
"""Extract opportunity signals from Deep Intel confirmed findings."""
|
| 188 |
+
opportunities = []
|
| 189 |
+
|
| 190 |
+
vuln_patterns = [
|
| 191 |
+
(r'(?:layoff|restructur|downsiz|headcount.?reduc)', "workforce_cut", 2),
|
| 192 |
+
(r'(?:delist|stock.?(?:drop|fall|declin)|52.week.low|penny.stock)', "financial_distress", 1),
|
| 193 |
+
(r'(?:FDA.?(?:reject|warning|recall)|regulatory.?(?:issue|fail|delay))', "regulatory_issue", 2),
|
| 194 |
+
(r'(?:bankrupt|insolvenc|cease.?operat|wind.?down|liquidat)', "collapse", 1),
|
| 195 |
+
(r'(?:customer.?complain|negative.?review|churn|losing.?customer)', "customer_risk", 2),
|
| 196 |
+
(r'(?:legacy|technical.?debt|outdated|proprietary.?lock)', "tech_weakness", 3),
|
| 197 |
+
(r'(?:no.?AI|lack.?(?:of.?)?(?:data|machine.learn|personali))', "ai_gap", 2),
|
| 198 |
+
]
|
| 199 |
+
|
| 200 |
+
for company, findings in intel_findings.items():
|
| 201 |
+
confirmed = [f for f in findings if f.get("confirmed")]
|
| 202 |
+
all_text = " ".join(f["text"] for f in confirmed).lower() if confirmed else ""
|
| 203 |
+
all_text_full = " ".join(f["text"] for f in findings).lower()
|
| 204 |
+
|
| 205 |
+
for pattern, opp_type, priority in vuln_patterns:
|
| 206 |
+
if re.search(pattern, all_text, re.IGNORECASE):
|
| 207 |
+
match_finding = next(
|
| 208 |
+
(f for f in confirmed if re.search(pattern, f["text"], re.IGNORECASE)),
|
| 209 |
+
None
|
| 210 |
+
)
|
| 211 |
+
if match_finding:
|
| 212 |
+
opportunities.append({
|
| 213 |
+
"type": opp_type,
|
| 214 |
+
"text": f"{company}: {match_finding['text'][:120]}",
|
| 215 |
+
"priority": priority,
|
| 216 |
+
"confirmed": True,
|
| 217 |
+
"source": match_finding.get("source", ""),
|
| 218 |
+
"company": company,
|
| 219 |
+
})
|
| 220 |
+
elif re.search(pattern, all_text_full, re.IGNORECASE):
|
| 221 |
+
match_finding = next(
|
| 222 |
+
(f for f in findings if re.search(pattern, f["text"], re.IGNORECASE)),
|
| 223 |
+
None
|
| 224 |
+
)
|
| 225 |
+
if match_finding:
|
| 226 |
+
opportunities.append({
|
| 227 |
+
"type": opp_type,
|
| 228 |
+
"text": f"{company}: {match_finding['text'][:120]}",
|
| 229 |
+
"priority": priority + 1,
|
| 230 |
+
"confirmed": False,
|
| 231 |
+
"source": match_finding.get("source", ""),
|
| 232 |
+
"company": company,
|
| 233 |
+
})
|
| 234 |
+
|
| 235 |
+
return opportunities
|
| 236 |
+
|
| 237 |
+
def _load_sota_tech_signals(self) -> list[dict]:
|
| 238 |
+
"""Load tech advantage signals from SOTA knowledge base."""
|
| 239 |
+
kb_path = self.research_dir / "sota" / "knowledge_base.json"
|
| 240 |
+
if not kb_path.exists():
|
| 241 |
+
return []
|
| 242 |
+
|
| 243 |
+
try:
|
| 244 |
+
with open(kb_path) as f:
|
| 245 |
+
kb = json.load(f)
|
| 246 |
+
except Exception:
|
| 247 |
+
return []
|
| 248 |
+
|
| 249 |
+
signals = []
|
| 250 |
+
|
| 251 |
+
for t in kb.get("techniques", []):
|
| 252 |
+
if t.get("priority") == "high" and t.get("gurma_fit"):
|
| 253 |
+
signals.append({
|
| 254 |
+
"type": "tech_advantage",
|
| 255 |
+
"text": f"{t['name']}: {t['gurma_fit'][:120]}",
|
| 256 |
+
"priority": 2,
|
| 257 |
+
"confirmed": True,
|
| 258 |
+
"company": "GURMA",
|
| 259 |
+
})
|
| 260 |
+
|
| 261 |
+
for p in kb.get("key_principles", [])[:2]:
|
| 262 |
+
if p.get("principle"):
|
| 263 |
+
signals.append({
|
| 264 |
+
"type": "tech_principle",
|
| 265 |
+
"text": f"{p['principle']}: {p.get('detail', '')[:100]}",
|
| 266 |
+
"priority": 3,
|
| 267 |
+
"confirmed": True,
|
| 268 |
+
"company": "GURMA",
|
| 269 |
+
})
|
| 270 |
+
|
| 271 |
+
return signals
|
| 272 |
+
|
| 273 |
+
def _opportunity_changed(self, new_opps: list[dict], existing: dict) -> bool:
|
| 274 |
+
existing_points = set(existing.get("points", []))
|
| 275 |
+
new_points = set(o["text"] for o in new_opps[:4])
|
| 276 |
+
|
| 277 |
+
if not existing_points:
|
| 278 |
+
return True
|
| 279 |
+
|
| 280 |
+
new_p1_types = {o["type"] for o in new_opps if o["priority"] == 1}
|
| 281 |
+
old_raw = existing.get("raw_opportunities", [])
|
| 282 |
+
old_p1_types = {o["type"] for o in old_raw if o.get("priority") == 1}
|
| 283 |
+
if new_p1_types != old_p1_types:
|
| 284 |
+
return True
|
| 285 |
+
|
| 286 |
+
overlap = existing_points & new_points
|
| 287 |
+
if len(overlap) < len(existing_points) / 2:
|
| 288 |
+
return True
|
| 289 |
+
|
| 290 |
+
return False
|
| 291 |
+
|
| 292 |
+
def _synthesize_opportunity_llm(self, opportunities: list[dict],
|
| 293 |
+
competitors: list[dict]) -> Optional[dict]:
|
| 294 |
+
if not LLM_ENABLED:
|
| 295 |
+
return None
|
| 296 |
+
|
| 297 |
+
llm = LLMClient()
|
| 298 |
+
|
| 299 |
+
opp_text = "\n".join(
|
| 300 |
+
f"- [{o['type']}] {'[CONFIRMED]' if o.get('confirmed') else '[SPECULATIVE]'} {o['text']}"
|
| 301 |
+
for o in opportunities[:12]
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
comp_summary = "\n".join(
|
| 305 |
+
f"- {c['name']}: status={c['status']}, "
|
| 306 |
+
f"{'stock=$'+format(c['stock'], '.2f') if c.get('stock') else 'no stock data'}, "
|
| 307 |
+
f"{'funding=$'+format(c['funding']/1e6, '.0f')+'M' if c.get('funding') else 'no funding data'}"
|
| 308 |
+
for c in competitors[:8]
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
system = (
|
| 312 |
+
"You are a strategic advisor for GURMA.ai, a Swiss AI company "
|
| 313 |
+
"entering rehabilitation robotics with 15 years of patient outcome "
|
| 314 |
+
"data (not just motion data) from BAMA Teknoloji. "
|
| 315 |
+
"You produce concise, actionable strategic assessments."
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
prompt = f"""Based on the following competitive + technology signals and competitor data,
|
| 319 |
+
produce a strategic opportunity assessment for GURMA.ai.
|
| 320 |
+
|
| 321 |
+
Signals (competitive, tech advantages, and threats):
|
| 322 |
+
{opp_text}
|
| 323 |
+
|
| 324 |
+
Competitor landscape:
|
| 325 |
+
{comp_summary}
|
| 326 |
+
|
| 327 |
+
Return JSON:
|
| 328 |
+
{{
|
| 329 |
+
"headline": "One punchy sentence (max 10 words) summarizing the #1 strategic opportunity",
|
| 330 |
+
"points": [
|
| 331 |
+
"Actionable insight 1 (max 20 words, include numbers where available)",
|
| 332 |
+
"Actionable insight 2",
|
| 333 |
+
"Actionable insight 3",
|
| 334 |
+
"Actionable insight 4"
|
| 335 |
+
]
|
| 336 |
+
}}
|
| 337 |
+
|
| 338 |
+
Rules:
|
| 339 |
+
- Headline should be about the OPPORTUNITY, not just a competitor's problem
|
| 340 |
+
- Points should mix competitive windows, tech advantages, AND threats
|
| 341 |
+
- Be specific: include dollar amounts, dates, competitor names, model/technique names
|
| 342 |
+
- Maximum 4 points, ranked by strategic importance
|
| 343 |
+
- confirmed signals should be weighted more heavily than speculative ones"""
|
| 344 |
+
|
| 345 |
+
response = llm.call(prompt, system, max_tokens=500)
|
| 346 |
+
if response:
|
| 347 |
+
match = re.search(r'\{.*\}', response, re.DOTALL)
|
| 348 |
+
if match:
|
| 349 |
+
try:
|
| 350 |
+
result = json.loads(match.group())
|
| 351 |
+
if result.get("headline") and result.get("points"):
|
| 352 |
+
return result
|
| 353 |
+
except Exception:
|
| 354 |
+
pass
|
| 355 |
+
return None
|
| 356 |
+
|
| 357 |
+
def detect_opportunities(self, competitors: list[dict], all_snippets: list[str]) -> dict:
|
| 358 |
+
"""Detect market opportunities from competitor data + Deep Intel findings."""
|
| 359 |
+
opportunities = []
|
| 360 |
+
|
| 361 |
+
collapsed = [c for c in competitors if c["status"] == "collapsed"]
|
| 362 |
+
weak = [c for c in competitors if c["status"] == "weak"]
|
| 363 |
+
|
| 364 |
+
if collapsed:
|
| 365 |
+
names = ", ".join(c["name"] for c in collapsed)
|
| 366 |
+
opportunities.append({
|
| 367 |
+
"type": "market_gap",
|
| 368 |
+
"text": f"{names} collapsed — customers seeking alternatives",
|
| 369 |
+
"priority": 1, "confirmed": True, "company": names,
|
| 370 |
+
})
|
| 371 |
+
|
| 372 |
+
if weak:
|
| 373 |
+
for c in weak:
|
| 374 |
+
opp_text = f"{c['name']} financially weak"
|
| 375 |
+
if c.get("stock"):
|
| 376 |
+
opp_text += f" (${c['stock']:.2f})"
|
| 377 |
+
opp_text += " — vulnerable to disruption"
|
| 378 |
+
opportunities.append({
|
| 379 |
+
"type": "weakness",
|
| 380 |
+
"text": opp_text,
|
| 381 |
+
"priority": 2, "confirmed": True, "company": c["name"],
|
| 382 |
+
})
|
| 383 |
+
|
| 384 |
+
growing = [c for c in competitors if c["status"] == "growing" and c.get("funding")]
|
| 385 |
+
for c in growing:
|
| 386 |
+
funding_m = c["funding"] / 1_000_000
|
| 387 |
+
opportunities.append({
|
| 388 |
+
"type": "threat",
|
| 389 |
+
"text": f"{c['name']} well-funded (${funding_m:.0f}M) — monitor closely",
|
| 390 |
+
"priority": 3, "confirmed": True, "company": c["name"],
|
| 391 |
+
})
|
| 392 |
+
|
| 393 |
+
if competitors:
|
| 394 |
+
opportunities.append({
|
| 395 |
+
"type": "advantage",
|
| 396 |
+
"text": "BAMA has 15 years outcome data vs. competitors' motion data",
|
| 397 |
+
"priority": 1, "confirmed": True, "company": "BAMA",
|
| 398 |
+
})
|
| 399 |
+
|
| 400 |
+
intel_findings = self._load_intel_findings()
|
| 401 |
+
if intel_findings:
|
| 402 |
+
intel_opps = self._extract_intel_opportunities(intel_findings)
|
| 403 |
+
existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
|
| 404 |
+
for io in intel_opps:
|
| 405 |
+
key = (io.get("company", ""), io["type"])
|
| 406 |
+
if key not in existing_keys:
|
| 407 |
+
opportunities.append(io)
|
| 408 |
+
existing_keys.add(key)
|
| 409 |
+
|
| 410 |
+
sota_signals = self._load_sota_tech_signals()
|
| 411 |
+
if sota_signals:
|
| 412 |
+
existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
|
| 413 |
+
for ts in sota_signals:
|
| 414 |
+
key = (ts.get("company", ""), ts["type"])
|
| 415 |
+
if key not in existing_keys:
|
| 416 |
+
opportunities.append(ts)
|
| 417 |
+
existing_keys.add(key)
|
| 418 |
+
|
| 419 |
+
opportunities.sort(key=lambda x: x["priority"])
|
| 420 |
+
|
| 421 |
+
llm_result = self._synthesize_opportunity_llm(opportunities, competitors)
|
| 422 |
+
|
| 423 |
+
if llm_result:
|
| 424 |
+
headline = llm_result["headline"]
|
| 425 |
+
points = llm_result["points"][:4]
|
| 426 |
+
else:
|
| 427 |
+
if collapsed:
|
| 428 |
+
headline = f"{collapsed[0]['name']} collapse creates market window"
|
| 429 |
+
elif weak:
|
| 430 |
+
headline = "Competitor weakness creates opportunity"
|
| 431 |
+
else:
|
| 432 |
+
headline = "Data advantage positions GURMA.ai for growth"
|
| 433 |
+
points = [o["text"] for o in opportunities[:4]]
|
| 434 |
+
|
| 435 |
+
sources = ["competitor"]
|
| 436 |
+
if intel_findings:
|
| 437 |
+
sources.append("intel")
|
| 438 |
+
if sota_signals:
|
| 439 |
+
sources.append("tech")
|
| 440 |
+
if llm_result:
|
| 441 |
+
sources.append("llm")
|
| 442 |
+
|
| 443 |
+
return {
|
| 444 |
+
"headline": headline,
|
| 445 |
+
"points": points,
|
| 446 |
+
"detected_at": datetime.now().strftime("%Y-%m-%d"),
|
| 447 |
+
"raw_opportunities": opportunities,
|
| 448 |
+
"sources": sources,
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
def load_existing_data(self) -> Optional[dict]:
|
| 452 |
+
if self.output_file.exists():
|
| 453 |
+
try:
|
| 454 |
+
with open(self.output_file) as f:
|
| 455 |
+
return json.load(f)
|
| 456 |
+
except:
|
| 457 |
+
pass
|
| 458 |
+
return None
|
| 459 |
+
|
| 460 |
+
def process(self) -> dict:
|
| 461 |
+
research_data = self.load_research_files()
|
| 462 |
+
if not research_data:
|
| 463 |
+
return {"competitors": [], "market": {}}
|
| 464 |
+
|
| 465 |
+
company_data = defaultdict(lambda: {
|
| 466 |
+
"mentions": 0, "snippets": [], "events": [], "money": [], "urls": []
|
| 467 |
+
})
|
| 468 |
+
|
| 469 |
+
for research in research_data:
|
| 470 |
+
for result in research.get("results", []):
|
| 471 |
+
text = f"{result.get('title', '')} {result.get('snippet', '')}"
|
| 472 |
+
url = result.get("url", "")
|
| 473 |
+
|
| 474 |
+
for company in self.find_mentions(text):
|
| 475 |
+
cd = company_data[company]
|
| 476 |
+
cd["mentions"] += 1
|
| 477 |
+
cd["snippets"].append(result.get("snippet", "")[:200])
|
| 478 |
+
cd["urls"].append(url)
|
| 479 |
+
cd["events"].extend(self.extract_events(text, company))
|
| 480 |
+
cd["money"].extend(MONEY_PATTERN.findall(text))
|
| 481 |
+
|
| 482 |
+
competitors = []
|
| 483 |
+
for company, info in COMPANY_DEFINITIONS.items():
|
| 484 |
+
data = company_data[company]
|
| 485 |
+
|
| 486 |
+
status = info["status"] if info.get("verified") else self.detect_status(data["snippets"], info["status"])
|
| 487 |
+
|
| 488 |
+
competitors.append({
|
| 489 |
+
"name": company,
|
| 490 |
+
"country": info["country"],
|
| 491 |
+
"product": info["product"],
|
| 492 |
+
"status": status,
|
| 493 |
+
"stock": self.extract_stock(data["snippets"]),
|
| 494 |
+
"funding": self.extract_funding(data["money"]),
|
| 495 |
+
"notes": data["snippets"][0] if data["snippets"] else "",
|
| 496 |
+
"mentions": data["mentions"],
|
| 497 |
+
"events": [{"date": e["date"], "event": e["context"][:100]} for e in data["events"][:10]],
|
| 498 |
+
"sample_urls": list(set(data["urls"]))[:5],
|
| 499 |
+
})
|
| 500 |
+
|
| 501 |
+
competitors.sort(key=lambda x: x["mentions"], reverse=True)
|
| 502 |
+
|
| 503 |
+
all_snippets = []
|
| 504 |
+
for company, data in company_data.items():
|
| 505 |
+
all_snippets.extend(data["snippets"])
|
| 506 |
+
new_opportunity = self.detect_opportunities(competitors, all_snippets)
|
| 507 |
+
|
| 508 |
+
existing = self.load_existing_data()
|
| 509 |
+
existing_opp = existing.get("opportunity", {}) if existing else {}
|
| 510 |
+
|
| 511 |
+
if existing_opp.get("confirmed"):
|
| 512 |
+
if self._opportunity_changed(new_opportunity.get("raw_opportunities", []), existing_opp):
|
| 513 |
+
opportunity = existing_opp
|
| 514 |
+
opportunity["update_available"] = True
|
| 515 |
+
opportunity["suggested_update"] = new_opportunity
|
| 516 |
+
else:
|
| 517 |
+
opportunity = existing_opp
|
| 518 |
+
opportunity["update_available"] = False
|
| 519 |
+
else:
|
| 520 |
+
opportunity = new_opportunity
|
| 521 |
+
opportunity["confirmed"] = False
|
| 522 |
+
opportunity["update_available"] = False
|
| 523 |
+
|
| 524 |
+
return {
|
| 525 |
+
"competitors": competitors,
|
| 526 |
+
"market": {"size_2024": 2_000_000_000, "size_2029_ai": 9_100_000_000, "cagr": 0.278},
|
| 527 |
+
"opportunity": opportunity,
|
| 528 |
+
"_generated": datetime.now().isoformat(),
|
| 529 |
+
"_source_files": [f.name for f in self.research_dir.glob("*.json") if not f.name.startswith(".")]
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
def save(self, data: dict = None) -> Path:
|
| 533 |
+
data = data or self.process()
|
| 534 |
+
self.output_file.parent.mkdir(parents=True, exist_ok=True)
|
| 535 |
+
with open(self.output_file, "w") as f:
|
| 536 |
+
json.dump(data, f, indent=2)
|
| 537 |
+
return self.output_file
|
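End-to-end usage of the extractor above, as a sketch (assumes research JSON files already exist under RESEARCH_DIR; not part of the commit):

from extract import CompetitorExtractor

extractor = CompetitorExtractor()
path = extractor.save()  # save() calls process() and writes competitors.json
print(f"Wrote {path}")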
intel.py
ADDED
@@ -0,0 +1,508 @@
"""
Deep competitive intelligence agent.

Runs structured research across categories for a single competitor,
producing markdown + JSON reports with [CONFIRMED]/[SPECULATIVE] tagging.
"""

from __future__ import annotations

import json
import re
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

try:
    from .config import RESEARCH_DIR, LLM_ENABLED
    from .search import SearchService, WebSearchResult
    from .llm import LLMClient
except ImportError:
    from config import RESEARCH_DIR, LLM_ENABLED
    from search import SearchService, WebSearchResult
    from llm import LLMClient


# ============================================================
# Intel Constants
# ============================================================

DEEP_INTEL_CATEGORIES = {
    "company_overview": {
        "label": "Company Overview",
        "queries": [
            "{company} founding history milestones",
            "{company} CEO leadership team background",
            "{company} funding rounds investors valuation",
            "{company} employee count headcount growth",
        ],
    },
    "product_technology": {
        "label": "Product & Technology",
        "queries": [
            "{company} exoskeleton rehabilitation robot product specifications",
            "{company} AI machine learning technology capabilities",
            "{company} new product launch release 2025 2026",
            "{company} patent filings exoskeleton rehabilitation innovation",
            "site:patents.google.com {company} exoskeleton OR rehabilitation",
        ],
    },
    "regulatory_clinical": {
        "label": "Regulatory & Clinical",
        "queries": [
            "site:accessdata.fda.gov {company}",
            "{company} FDA 510k clearance CE mark MDR approval",
            "site:clinicaltrials.gov {company} rehabilitation",
            "{company} clinical outcomes study peer-reviewed results",
        ],
    },
    "market_channels": {
        "label": "Market & Channels",
        "queries": [
            "{company} hospital clinic installations customer base",
            "{company} insurance reimbursement coverage CMS",
            "{company} partnerships distributors resellers",
            "{company} conference MEDICA ACRM CES 2025 2026",
        ],
    },
    "vulnerabilities_threats": {
        "label": "Vulnerabilities & Threats",
        "queries": [
            "{company} weaknesses problems criticism recall",
            "{company} layoffs restructuring financial difficulty",
            "{company} Glassdoor employee reviews satisfaction",
            "{company} rehabilitation robotics AI expansion strategy 2025 2026",
            "{company} acquisitions mergers market share growth",
            "site:sec.gov {company} 10-K OR 8-K",
        ],
    },
}

PRIMARY_SOURCE_DOMAINS = {
    "sec.gov", "fda.gov", "clinicaltrials.gov", "patents.google.com",
    "accessdata.fda.gov",
    "crunchbase.com", "tracxn.com", "pitchbook.com", "cbinsights.com",
    "bloomberg.com", "reuters.com", "wsj.com", "finance.yahoo.com",
    "wellfound.com",
    "linkedin.com", "glassdoor.com",
    "g2.com", "capterra.com", "trustpilot.com",
    "therobotreport.com", "exoskeletonreport.com", "medgadget.com",
}

CATEGORY_EXPECTED = {
    "company_overview": {
        "founding_year": [r'(?:founded|established|incorporated|started)\s+(?:in\s+)?(\d{4})'],
        "leadership": [r'(?:CEO|Chief Executive|CTO|CFO|President|Founder|Chairman|COO)'],
        "funding": [r'\$[\d,.]+\s*(?:million|billion|M|B)', r'(?:series\s+[A-F]|seed|IPO|funding\s+round)'],
        "employees": [r'(\d[\d,]*)\s*(?:employees|staff|headcount|team\s+members|workers)'],
    },
    "product_technology": {
        "products": [r'(?:product|device|robot|exoskeleton|system)\s'],
        "technology": [r'(?:AI|machine\s+learning|deep\s+learning|sensor|actuator|algorithm|neural)'],
        "patents": [r'(?:patent|IP|intellectual\s+property|invention)'],
        "recent_launches": [r'(?:launch|release|announc|unveil|introduc)\w*\s+.{0,30}(?:2025|2026)'],
    },
    "regulatory_clinical": {
        "fda_clearance": [r'(?:510\(?k\)?|FDA.?clear|FDA.?approv|de\s*novo)'],
        "ce_mark": [r'(?:CE.?mark|MDR|EU.?approv|notified.?body)'],
        "clinical_trials": [r'(?:clinical.?trial|NCT\d|randomized|controlled.?study|peer.?review)'],
        "clinical_outcomes": [r'(?:outcome|efficacy|recovery.?rate|improvement|functional.?score)'],
    },
    "market_channels": {
        "installations": [r'(?:hospital|clinic|center|install|deploy|site)\s'],
        "reimbursement": [r'(?:reimburse|insurance|CMS|Medicare|Medicaid|HCPCS|coverage|payer)'],
        "partnerships": [r'(?:partner|alliance|collaborat|distribut|reseller|dealer)'],
        "events": [r'(?:conference|MEDICA|ACRM|CES|expo|trade\s+show|summit)'],
    },
    "vulnerabilities_threats": {
        "weaknesses": [r'(?:weakness|problem|challenge|struggle|fail|recall|warning)'],
        "financial_stress": [r'(?:layoff|restructur|loss|declining|debt|delist|penny.stock)'],
        "employee_sentiment": [r'(?:glassdoor|employee.?review|work.?culture|turnover)'],
        "expansion": [r'(?:expansion|new.?market|acqui|merger|market.?share|growth.?strategy)'],
    },
}

CATEGORY_SYNTHESIS_QUESTIONS = {
    "company_overview": [
        "Founding story and key milestones",
        "Leadership team (backgrounds, medical device experience)",
        "Funding history (rounds, investors, valuations)",
        "Employee count and growth trajectory",
    ],
    "product_technology": [
        "Product catalog (devices, indications, patient populations)",
        "AI / machine learning capabilities (data they train on, algorithms used)",
        "Recent product launches and roadmap clues (last 12 months)",
        "Patent portfolio and innovation direction",
        "How does their technology compare to GURMA.ai's outcome-data approach?",
    ],
    "regulatory_clinical": [
        "FDA clearances (510(k) numbers, De Novo, dates)",
        "CE mark / MDR status in Europe",
        "Active clinical trials (ClinicalTrials.gov entries, endpoints)",
        "Published clinical outcomes (peer-reviewed studies, recovery rates)",
        "Reimbursement status (CMS, Medicare, private payer coverage)",
    ],
    "market_channels": [
        "Hospital and clinic installations (how many sites, which countries)",
        "Insurance and reimbursement strategy (pricing, payer relationships)",
        "Distribution partnerships and reseller network",
        "Conference and KOL presence (MEDICA, ACRM, physician endorsements)",
    ],
    "vulnerabilities_threats": [
        "What are they bad at? (clinical limitations, missing indications)",
        "Financial health (SEC filings, cash burn, stock trajectory)",
        "Employee sentiment (Glassdoor, hiring patterns, layoffs)",
        "Growth strategy (acquisitions, new markets, AI investments)",
        "What could they do that would hurt GURMA.ai most?",
        "Early warning signals to monitor",
    ],
}


# ============================================================
# Intel Agent
# ============================================================

@dataclass
class IntelSection:
    category: str
    label: str
    queries_executed: list = field(default_factory=list)
    results: list = field(default_factory=list)
    findings: list = field(default_factory=list)
    gaps: list = field(default_factory=list)
    sources: list = field(default_factory=list)


class CompetitorIntelAgent:
    """Deep competitive intelligence agent for a single competitor.

    Usage:
        agent = CompetitorIntelAgent("Ekso Bionics")
        report = agent.run()
        report = agent.run(use_external_llm=True)
    """

    def __init__(self, company: str, search: SearchService = None, llm: LLMClient = None):
        self.company = company
        self.search = search or SearchService()
        self.llm = llm or LLMClient()
        self.sections: dict[str, IntelSection] = {}
        self.output_dir = RESEARCH_DIR / "intel"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run(self, categories: list[str] = None, use_external_llm: bool = False,
            delay: float = 1.0, max_results: int = 10) -> Path:
        cats = categories or list(DEEP_INTEL_CATEGORIES.keys())

        total_queries = sum(
            len(DEEP_INTEL_CATEGORIES[c]["queries"])
            for c in cats if c in DEEP_INTEL_CATEGORIES
        )

        print(f"\n{'='*60}")
        print(f"Deep Competitive Intelligence: {self.company}")
        print(f"Categories: {len(cats)} | Queries: ~{total_queries}")
        print(f"Analysis: built-in{' + external LLM' if use_external_llm and self.llm.enabled else ''}")
        print(f"{'='*60}\n")

        for cat_key in cats:
            cat = DEEP_INTEL_CATEGORIES.get(cat_key)
            if not cat:
                print(f"[SKIP] Unknown category: {cat_key}")
                continue

            section = IntelSection(category=cat_key, label=cat["label"])
            self._research_category(section, cat, use_external_llm, delay, max_results)
            self.sections[cat_key] = section

        report_path = self._generate_report(use_external_llm)
        self._save_data()

        print(f"\n{'='*60}")
        print(f"Report: {report_path}")
        total_findings = sum(len(s.findings) for s in self.sections.values())
        total_gaps = sum(len(s.gaps) for s in self.sections.values())
        print(f"Findings: {total_findings} | Gaps: {total_gaps}")
        print(f"{'='*60}\n")

        return report_path

    def _research_category(self, section: IntelSection, cat: dict,
                           use_external_llm: bool, delay: float, max_results: int):
        print(f"\n--- {section.label} ---")

        queries = [q.format(company=self.company) for q in cat["queries"]]

        if use_external_llm and self.llm.enabled:
            extra = self.llm.generate_category_queries(self.company, section.label)
            if extra:
                queries.extend(extra)
                print(f" [EXTERNAL LLM] +{len(extra)} additional queries")

        for query in queries:
            print(f" [SEARCH] {query}")
            try:
                results = self.search.search(query, max_results=max_results, save=True)
                section.queries_executed.append(query)
                section.results.extend(results)
                for r in results:
                    if r.url and r.url not in section.sources:
                        section.sources.append(r.url)
                print(f" -> {len(results)} results")
            except Exception as e:
                print(f" -> Error: {e}")

            if delay > 0:
                time.sleep(delay)

        section.findings = self._analyze_section(section)
        section.gaps = self._detect_gaps(section)

        confirmed = sum(1 for f in section.findings if f.get("confirmed"))
        speculative = len(section.findings) - confirmed
        print(f" [ANALYSIS] {len(section.findings)} findings ({confirmed} confirmed, {speculative} speculative)")
        if section.gaps:
            print(f" [GAPS] {len(section.gaps)}: {', '.join(g['text'] for g in section.gaps[:3])}")

        if self.llm.enabled and section.results:
            print(f" [SYNTHESIS] Synthesizing {section.label}...")
            synthesis = self.llm.synthesize_intel(
                self.company, section.category, section.label, section.results
            )
            synth_findings = synthesis.get("findings", [])
            synth_gaps = synthesis.get("gaps", [])

            if synth_findings:
                synth_sources = {f.get("source", "") for f in synth_findings if f.get("source")}
                for bf in section.findings:
                    if bf.get("source") and bf["source"] not in synth_sources:
                        synth_findings.append(bf)
                section.findings = synth_findings

                for f in synth_findings:
                    if isinstance(f, dict):
                        tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
                        print(f" {tag} {f.get('text', '')[:80]}")

            existing_gaps = {g["text"].lower() for g in section.gaps}
            for sg in synth_gaps:
                gap_text = sg.get("text", sg) if isinstance(sg, dict) else sg
                if gap_text.lower() not in existing_gaps:
                    section.gaps.append({"text": gap_text})

    def _analyze_section(self, section: IntelSection) -> list[dict]:
        findings = []
        seen_keys = set()
        aliases = self._get_aliases()

        for r in section.results:
            text_lower = f"{r.title} {r.snippet}".lower()

            if not any(alias in text_lower for alias in aliases):
                continue

            dedup_key = re.sub(r'[^a-z0-9]', '', r.title.lower()[:50])
            if dedup_key in seen_keys:
                continue
            seen_keys.add(dedup_key)

            confirmed = self._is_primary_source(r.url)

            title = r.title.strip()
            snippet = r.snippet.strip()[:250]
            finding_text = f"{title}: {snippet}" if snippet else title

            findings.append({
                "text": finding_text,
                "source": r.url,
                "confirmed": confirmed,
            })

        findings.sort(key=lambda f: (not f["confirmed"], -len(f["text"])))
        return findings[:15]

    def _is_primary_source(self, url: str) -> bool:
        if not url:
            return False
        url_lower = url.lower()

        for domain in PRIMARY_SOURCE_DOMAINS:
            if domain in url_lower:
                return True

        # Treat the company's own site as primary: match aliases against the
        # host part of the URL only (the original one-liner mixed `and` with a
        # conditional expression and was hard to read; behavior is unchanged).
        parts = url_lower.split("/")
        if len(parts) > 2:
            host = parts[2]
            for alias in self._get_aliases():
                slug = alias.replace(" ", "")
                if len(slug) >= 4 and slug in host:
                    return True

        return False
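    # Editor's illustration of the primary-source test above (hypothetical URLs,
    # not in the commit):
    #   _is_primary_source("https://www.sec.gov/cgi-bin/browse-edgar")  -> True
    #   _is_primary_source("https://eksobionics.com/newsroom")          -> True  (alias in host)
    #   _is_primary_source("https://random-blog.example.com/post")      -> False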

    def _detect_gaps(self, section: IntelSection) -> list[dict]:
        expected = CATEGORY_EXPECTED.get(section.category, {})
        if not expected:
            return []

        aliases = self._get_aliases()
        relevant_text = " ".join(
            f"{r.title} {r.snippet}"
            for r in section.results
            if any(a in f"{r.title} {r.snippet}".lower() for a in aliases)
        )

        if not relevant_text:
            return [{"text": f"No relevant results found for {section.label}"}]

        relevant_lower = relevant_text.lower()
        gaps = []
        for field_name, patterns in expected.items():
            found = any(
                re.search(p, relevant_lower, re.IGNORECASE)
                for p in patterns
            )
            if not found:
                label = field_name.replace("_", " ").replace("/", " / ")
                gaps.append({"text": f"No data found for: {label}"})

        return gaps

    def _get_aliases(self) -> list[str]:
        try:
            from .extract import COMPANY_DEFINITIONS
        except ImportError:
            from extract import COMPANY_DEFINITIONS
        info = COMPANY_DEFINITIONS.get(self.company, {})
        aliases = info.get("aliases", [])
        if not aliases:
            aliases = [self.company.lower()]
        return aliases

    def _generate_report(self, use_external_llm: bool) -> Path:
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        slug = self.company.lower().replace(" ", "-").replace("/", "-")
        slug = "".join(c for c in slug if c.isalnum() or c == "-")

        report_path = self.output_dir / f"{timestamp}_{slug}_intel.md"

        method = "Built-in analysis"
        if LLM_ENABLED:
            method += " + LLM synthesis (OpenRouter)"
        if use_external_llm:
            method += " + extra query generation"

        lines = [
            f"# Competitive Intelligence: {self.company}",
            "",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')} ",
            f"**Method:** {method} ",
            f"**Searches:** {sum(len(s.queries_executed) for s in self.sections.values())} ",
            f"**Sources:** {sum(len(s.sources) for s in self.sections.values())} unique URLs",
            "",
            "> **Legend:** [CONFIRMED] = from primary/verified source | [SPECULATIVE] = inferred or unverified",
            "",
            "---",
        ]

        for section in self.sections.values():
            lines.append("")
            lines.append(f"## {section.label}")
            lines.append("")

            if not section.findings:
                lines.append("*No findings. Try broader queries or `--external-llm` for additional analysis.*")
                lines.append("")
                continue

            for f in section.findings:
                if isinstance(f, dict):
                    tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
                    text = f.get("text", "")
                    source = f.get("source", "")
                    lines.append(f"- **{tag}** {text}")
                    if source:
                        lines.append(f"  - Source: {source}")
                else:
                    lines.append(f"- {f}")

            if section.gaps:
                lines.append("")
                lines.append("**Knowledge Gaps:**")
                for gap in section.gaps:
                    gap_text = gap.get("text", gap) if isinstance(gap, dict) else gap
                    lines.append(f"- [ ] {gap_text}")

            lines.append("")

            if section.sources:
                lines.append(f"<details><summary>Sources ({len(section.sources)} URLs)</summary>")
                lines.append("")
                for url in section.sources[:10]:
                    lines.append(f"- {url}")
                if len(section.sources) > 10:
                    lines.append(f"- ... and {len(section.sources) - 10} more")
                lines.append("")
                lines.append("</details>")
                lines.append("")

        lines.extend(["---", "", "## Summary", ""])

        total_findings = sum(len(s.findings) for s in self.sections.values())
        confirmed = sum(
            sum(1 for f in s.findings if isinstance(f, dict) and f.get("confirmed"))
            for s in self.sections.values()
        )
        speculative = total_findings - confirmed

        lines.append("| Metric | Count |")
        lines.append("|--------|-------|")
        lines.append(f"| Total findings | {total_findings} |")
        lines.append(f"| Confirmed | {confirmed} |")
        lines.append(f"| Speculative | {speculative} |")
        lines.append(f"| Categories | {len(self.sections)} |")
        lines.append("")

        all_gaps = []
        for s in self.sections.values():
            for g in s.gaps:
                gap_text = g.get("text", g) if isinstance(g, dict) else g
                all_gaps.append(f"{s.label}: {gap_text}")

        if all_gaps:
            lines.append("### Outstanding Gaps")
            lines.append("")
            for gap in all_gaps:
                lines.append(f"- [ ] {gap}")
            lines.append("")

        with open(report_path, "w") as f:
            f.write("\n".join(lines))

        return report_path

    def _save_data(self):
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        slug = self.company.lower().replace(" ", "-").replace("/", "-")
        slug = "".join(c for c in slug if c.isalnum() or c == "-")

        data = {
            "company": self.company,
            "generated": datetime.now().isoformat(),
            "sections": {},
        }

        for cat_key, section in self.sections.items():
            data["sections"][cat_key] = {
                "label": section.label,
                "queries_executed": section.queries_executed,
                "finding_count": len(section.findings),
                "findings": section.findings,
                "gaps": section.gaps,
                "source_count": len(section.sources),
                "sources": section.sources[:20],
            }

        json_path = self.output_dir / f"{timestamp}_{slug}_intel.json"
        with open(json_path, "w") as f:
            json.dump(data, f, indent=2)
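A typical invocation of the agent, following the usage shown in the class docstring (the category filter here is an illustrative assumption):

agent = CompetitorIntelAgent("Ekso Bionics")
report = agent.run(categories=["vulnerabilities_threats"], use_external_llm=True)
# writes <timestamp>_ekso-bionics_intel.md plus a matching .json under data/intel/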
llm.py
ADDED
@@ -0,0 +1,154 @@
"""
OpenRouter LLM client for research analysis and synthesis.
"""

from __future__ import annotations

import json
import re
import sys
from typing import Optional

try:
    from .config import OPENROUTER_API_KEY, LLM_MODEL
except ImportError:
    from config import OPENROUTER_API_KEY, LLM_MODEL


class LLMClient:
    """OpenRouter LLM client for research analysis."""

    def __init__(self, api_key: str = None, model: str = LLM_MODEL):
        self.api_key = api_key or OPENROUTER_API_KEY
        self.model = model
        self.enabled = bool(self.api_key)

    def call(self, prompt: str, system: str = None, max_tokens: int = 1000) -> Optional[str]:
        if not self.enabled:
            return None

        import requests

        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        try:
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": max_tokens,
                    "temperature": 0.3,
                },
                timeout=60
            )
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except Exception as e:
            print(f"[LLM ERROR] {e}", file=sys.stderr)
            return None
+
|
| 58 |
+
def generate_category_queries(self, company: str, category_label: str) -> list[str]:
|
| 59 |
+
"""Generate additional search queries for a specific intel category."""
|
| 60 |
+
system = (
|
| 61 |
+
"You are a competitive intelligence analyst specializing in "
|
| 62 |
+
"rehabilitation robotics and medical devices. "
|
| 63 |
+
"Generate specific, targeted web search queries. "
|
| 64 |
+
"Return ONLY a JSON array of query strings. "
|
| 65 |
+
"Focus on recent sources (last 18 months). Prioritize primary sources."
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
prompt = f"""Company: {company}
|
| 69 |
+
Category: {category_label}
|
| 70 |
+
|
| 71 |
+
Generate 3-4 additional specific search queries for deep competitive intelligence on this company in this category.
|
| 72 |
+
Focus on primary sources: company blog, official announcements, SEC filings, patent databases, verified review sites, job postings.
|
| 73 |
+
Return as JSON array: ["query1", "query2", ...]"""
|
| 74 |
+
|
| 75 |
+
response = self.call(prompt, system)
|
| 76 |
+
if response:
|
| 77 |
+
match = re.search(r'\[.*\]', response, re.DOTALL)
|
| 78 |
+
if match:
|
| 79 |
+
try:
|
| 80 |
+
return json.loads(match.group())[:4]
|
| 81 |
+
except Exception:
|
| 82 |
+
pass
|
| 83 |
+
return []
|
| 84 |
+
|
| 85 |
+
def synthesize_intel(self, company: str, category_key: str,
|
| 86 |
+
category_label: str, results: list,
|
| 87 |
+
synthesis_questions: dict = None) -> dict:
|
| 88 |
+
"""Synthesize search results into structured intelligence.
|
| 89 |
+
|
| 90 |
+
Uses per-category questions to produce distilled, actionable findings.
|
| 91 |
+
Returns dict with 'findings' and 'gaps'.
|
| 92 |
+
"""
|
| 93 |
+
try:
|
| 94 |
+
from .intel import CATEGORY_SYNTHESIS_QUESTIONS
|
| 95 |
+
except ImportError:
|
| 96 |
+
from intel import CATEGORY_SYNTHESIS_QUESTIONS
|
| 97 |
+
|
| 98 |
+
questions = (synthesis_questions or CATEGORY_SYNTHESIS_QUESTIONS).get(category_key, [])
|
| 99 |
+
if not questions:
|
| 100 |
+
return {"findings": [], "gaps": []}
|
| 101 |
+
|
| 102 |
+
questions_text = "\n".join(f"- {q}" for q in questions)
|
| 103 |
+
|
| 104 |
+
results_text = "\n".join([
|
| 105 |
+
f"- [{r.source}] {r.title}\n {r.snippet[:300]}\n URL: {r.url}"
|
| 106 |
+
for r in results[:15]
|
| 107 |
+
])
|
| 108 |
+
|
| 109 |
+
system = (
|
| 110 |
+
"You are a competitive intelligence analyst for GURMA.ai, "
|
| 111 |
+
"a Swiss AI company entering rehabilitation robotics with "
|
| 112 |
+
"15 years of patient outcome data from BAMA Teknoloji. "
|
| 113 |
+
"Synthesize search results into actionable intelligence. "
|
| 114 |
+
"Recent sources only (last 18 months). "
|
| 115 |
+
"Flag speculation vs confirmed facts. Include URLs."
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
prompt = f"""Conduct deep competitive intelligence on {company}.
|
| 119 |
+
Category: {category_label}
|
| 120 |
+
|
| 121 |
+
Answer these specific questions based on the search results:
|
| 122 |
+
{questions_text}
|
| 123 |
+
|
| 124 |
+
Search results:
|
| 125 |
+
{results_text}
|
| 126 |
+
|
| 127 |
+
Return JSON:
|
| 128 |
+
{{
|
| 129 |
+
"findings": [
|
| 130 |
+
{{"text": "synthesized answer to one of the questions", "confirmed": true, "source": "url"}},
|
| 131 |
+
{{"text": "inferred insight", "confirmed": false, "source": "url or empty"}}
|
| 132 |
+
],
|
| 133 |
+
"gaps": [
|
| 134 |
+
{{"text": "question that could NOT be answered from search results"}}
|
| 135 |
+
]
|
| 136 |
+
}}
|
| 137 |
+
|
| 138 |
+
Rules:
|
| 139 |
+
- confirmed=true ONLY for facts from primary sources (company website, SEC filings, press releases)
|
| 140 |
+
- confirmed=false for inferred or secondary-source information
|
| 141 |
+
- Each finding should directly answer one of the questions above
|
| 142 |
+
- Be specific and quantitative where possible
|
| 143 |
+
- If a question cannot be answered, add it to gaps
|
| 144 |
+
- Maximum 12 findings"""
|
| 145 |
+
|
| 146 |
+
response = self.call(prompt, system, max_tokens=2000)
|
| 147 |
+
if response:
|
| 148 |
+
match = re.search(r'\{.*\}', response, re.DOTALL)
|
| 149 |
+
if match:
|
| 150 |
+
try:
|
| 151 |
+
return json.loads(match.group())
|
| 152 |
+
except Exception:
|
| 153 |
+
pass
|
| 154 |
+
return {"findings": [], "gaps": []}
|
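llm.py now carries the client previously embedded in research.py, with one behavioral addition: `synthesize_intel` accepts an optional `synthesis_questions` override and otherwise falls back to `intel.CATEGORY_SYNTHESIS_QUESTIONS`. A sketch of the intended two-pass flow (query generation, then synthesis over collected results); `FakeResult` is a purely illustrative stand-in for the search module's result objects, mirroring the fields the synthesizer reads:

from dataclasses import dataclass
from llm import LLMClient

@dataclass
class FakeResult:
    # Stand-in carrying the attributes synthesize_intel formats:
    # title, url, snippet, source.
    title: str
    url: str
    snippet: str
    source: str

client = LLMClient()  # enabled only if OPENROUTER_API_KEY is configured
extra_queries = client.generate_category_queries("Ekso Bionics", "Regulatory & Clinical")

results = [FakeResult("Example result title", "https://example.com",
                      "example snippet text", "duckduckgo")]
intel = client.synthesize_intel("Ekso Bionics", "regulatory_clinical",
                                "Regulatory & Clinical", results)
print(len(extra_queries), "extra queries;", len(intel["findings"]), "findings")

Both methods degrade gracefully: with no API key configured, `generate_category_queries` returns an empty list and `synthesize_intel` returns empty findings and gaps.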
research.py
CHANGED
|
@@ -1,1922 +1,70 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
GURMA.ai Research Tool
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# Deep competitive intelligence on a single company
|
| 19 |
-
python research.py competitor "Ekso Bionics"
|
| 20 |
-
python research.py competitor "Fourier Intelligence" --external-llm
|
| 21 |
-
python research.py competitor "Cyberdyne" -c company_overview,product_deep_dive
|
| 22 |
-
python research.py competitor --list-categories
|
| 23 |
-
|
| 24 |
-
# Extract to competitors.json (for dashboard)
|
| 25 |
-
python research.py extract
|
| 26 |
-
|
| 27 |
-
# List saved results
|
| 28 |
-
python research.py list
|
| 29 |
"""
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
import
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
import time
|
| 39 |
-
from abc import ABC, abstractmethod
|
| 40 |
-
from dataclasses import dataclass, field, asdict
|
| 41 |
-
from datetime import datetime, timedelta
|
| 42 |
-
from pathlib import Path
|
| 43 |
-
from typing import Optional, Protocol
|
| 44 |
-
|
| 45 |
-
# ============================================================
|
| 46 |
-
# Configuration
|
| 47 |
-
# ============================================================
|
| 48 |
-
|
| 49 |
-
# Detect environment: HF Space (Docker at /app) vs local development
|
| 50 |
-
def _detect_project_root() -> Path:
|
| 51 |
-
"""Detect project root based on environment."""
|
| 52 |
-
# HF Spaces: running from /app with research.py in root
|
| 53 |
-
if os.getenv("HF_SPACE") or Path("/app/research.py").exists():
|
| 54 |
-
return Path("/app")
|
| 55 |
-
# Local: research.py is in src/utils/
|
| 56 |
-
return Path(__file__).parent.parent.parent
|
| 57 |
-
|
| 58 |
-
PROJECT_ROOT = _detect_project_root()
|
| 59 |
-
IS_HF_SPACE = PROJECT_ROOT == Path("/app")
|
| 60 |
-
|
| 61 |
-
# Load .env if present (local development)
|
| 62 |
-
if not IS_HF_SPACE:
|
| 63 |
-
try:
|
| 64 |
-
from dotenv import load_dotenv
|
| 65 |
-
load_dotenv(PROJECT_ROOT / ".env")
|
| 66 |
-
except ImportError:
|
| 67 |
-
pass
|
| 68 |
-
|
| 69 |
-
# Directories - different structure for HF Space vs local
|
| 70 |
-
if IS_HF_SPACE:
|
| 71 |
-
RESEARCH_DIR = PROJECT_ROOT / "data"
|
| 72 |
-
DATA_DIR = PROJECT_ROOT / "data"
|
| 73 |
-
else:
|
| 74 |
-
RESEARCH_DIR = PROJECT_ROOT / "data"
|
| 75 |
-
DATA_DIR = PROJECT_ROOT / "src" / "dashboard"
|
| 76 |
-
|
| 77 |
-
# Ensure directories exist
|
| 78 |
-
RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
|
| 79 |
-
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 80 |
-
|
| 81 |
-
# API Keys
|
| 82 |
-
SERPAPI_KEY = os.getenv("SERPAPI_KEY")
|
| 83 |
-
BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")
|
| 84 |
-
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
| 85 |
-
|
| 86 |
-
# LLM Config
|
| 87 |
-
LLM_MODEL = "deepseek/deepseek-chat"
|
| 88 |
-
LLM_ENABLED = bool(OPENROUTER_API_KEY)
|
| 89 |
-
|
| 90 |
-
# Known competitors for batch research
|
| 91 |
-
COMPETITORS = [
|
| 92 |
-
"Hocoma", "Ekso Bionics", "Lifeward ReWalk", "Fourier Intelligence",
|
| 93 |
-
"Cyberdyne HAL", "Wandercraft", "Myomo", "Bionik",
|
| 94 |
-
]
|
| 95 |
-
|
| 96 |
-
# Query templates for batch research
|
| 97 |
-
BATCH_QUERY_TEMPLATES = [
|
| 98 |
-
"{company} latest news 2025 2026",
|
| 99 |
-
"{company} funding investors valuation",
|
| 100 |
-
"{company} FDA approval regulatory",
|
| 101 |
-
"{company} partnerships collaborations",
|
| 102 |
-
"{company} AI machine learning technology",
|
| 103 |
-
# Targeted regulatory sources
|
| 104 |
-
"site:accessdata.fda.gov {company}", # FDA 510(k) clearances
|
| 105 |
-
"site:clinicaltrials.gov {company} rehabilitation", # Clinical trials
|
| 106 |
-
# Funding & corporate
|
| 107 |
-
"site:crunchbase.com {company}", # Funding history
|
| 108 |
-
"site:sec.gov {company} 10-K OR 8-K", # SEC filings (public companies)
|
| 109 |
-
# Patents & innovation
|
| 110 |
-
"site:patents.google.com {company} exoskeleton OR rehabilitation",
|
| 111 |
-
]
|
| 112 |
-
|
| 113 |
-
MARKET_QUERIES = [
|
| 114 |
-
"rehabilitation robotics market size 2026 forecast",
|
| 115 |
-
"exoskeleton market growth AI integration",
|
| 116 |
-
"rehabilitation robotics insurance reimbursement",
|
| 117 |
-
"medical exoskeleton FDA approval 2025",
|
| 118 |
-
"stroke rehabilitation AI technology",
|
| 119 |
-
"spinal cord injury exoskeleton treatment",
|
| 120 |
-
"rehabilitation robotics competitive landscape",
|
| 121 |
-
# Industry publications
|
| 122 |
-
"site:exoskeletonreport.com 2025 2026", # Industry news
|
| 123 |
-
"site:medgadget.com exoskeleton rehabilitation", # Med-tech news
|
| 124 |
-
# Regulatory landscape
|
| 125 |
-
"site:fda.gov rehabilitation robotics guidance",
|
| 126 |
-
"MDR medical device regulation exoskeleton CE mark 2025",
|
| 127 |
-
# Academic/clinical
|
| 128 |
-
"site:pubmed.ncbi.nlm.nih.gov rehabilitation robotics AI 2024 2025",
|
| 129 |
-
# Insurance/reimbursement (key for Holland market)
|
| 130 |
-
"exoskeleton insurance coverage CMS reimbursement code",
|
| 131 |
-
"rehabilitation robotics HCPCS code billing",
|
| 132 |
-
]
|
| 133 |
-
|
| 134 |
-
# Deep competitive intelligence query templates by category
|
| 135 |
-
# Tailored for rehabilitation robotics / medical device companies (~25 queries)
|
| 136 |
-
DEEP_INTEL_CATEGORIES = {
|
| 137 |
-
"company_overview": {
|
| 138 |
-
"label": "Company Overview",
|
| 139 |
-
"queries": [
|
| 140 |
-
"{company} founding history milestones",
|
| 141 |
-
"{company} CEO leadership team background",
|
| 142 |
-
"{company} funding rounds investors valuation",
|
| 143 |
-
"{company} employee count headcount growth",
|
| 144 |
-
],
|
| 145 |
-
},
|
| 146 |
-
"product_technology": {
|
| 147 |
-
"label": "Product & Technology",
|
| 148 |
-
"queries": [
|
| 149 |
-
"{company} exoskeleton rehabilitation robot product specifications",
|
| 150 |
-
"{company} AI machine learning technology capabilities",
|
| 151 |
-
"{company} new product launch release 2025 2026",
|
| 152 |
-
"{company} patent filings exoskeleton rehabilitation innovation",
|
| 153 |
-
"site:patents.google.com {company} exoskeleton OR rehabilitation",
|
| 154 |
-
],
|
| 155 |
-
},
|
| 156 |
-
"regulatory_clinical": {
|
| 157 |
-
"label": "Regulatory & Clinical",
|
| 158 |
-
"queries": [
|
| 159 |
-
"site:accessdata.fda.gov {company}",
|
| 160 |
-
"{company} FDA 510k clearance CE mark MDR approval",
|
| 161 |
-
"site:clinicaltrials.gov {company} rehabilitation",
|
| 162 |
-
"{company} clinical outcomes study peer-reviewed results",
|
| 163 |
-
],
|
| 164 |
-
},
|
| 165 |
-
"market_channels": {
|
| 166 |
-
"label": "Market & Channels",
|
| 167 |
-
"queries": [
|
| 168 |
-
"{company} hospital clinic installations customer base",
|
| 169 |
-
"{company} insurance reimbursement coverage CMS",
|
| 170 |
-
"{company} partnerships distributors resellers",
|
| 171 |
-
"{company} conference MEDICA ACRM CES 2025 2026",
|
| 172 |
-
],
|
| 173 |
-
},
|
| 174 |
-
"vulnerabilities_threats": {
|
| 175 |
-
"label": "Vulnerabilities & Threats",
|
| 176 |
-
"queries": [
|
| 177 |
-
"{company} weaknesses problems criticism recall",
|
| 178 |
-
"{company} layoffs restructuring financial difficulty",
|
| 179 |
-
"{company} Glassdoor employee reviews satisfaction",
|
| 180 |
-
"{company} rehabilitation robotics AI expansion strategy 2025 2026",
|
| 181 |
-
"{company} acquisitions mergers market share growth",
|
| 182 |
-
"site:sec.gov {company} 10-K OR 8-K",
|
| 183 |
-
],
|
| 184 |
-
},
|
| 185 |
-
}
|
| 186 |
-
|
| 187 |
-
# Primary/authoritative source domains for confirmed vs speculative scoring
|
| 188 |
-
PRIMARY_SOURCE_DOMAINS = {
|
| 189 |
-
# Regulatory / Official
|
| 190 |
-
"sec.gov", "fda.gov", "clinicaltrials.gov", "patents.google.com",
|
| 191 |
-
"accessdata.fda.gov",
|
| 192 |
-
# Financial / Business data
|
| 193 |
-
"crunchbase.com", "tracxn.com", "pitchbook.com", "cbinsights.com",
|
| 194 |
-
"bloomberg.com", "reuters.com", "wsj.com", "finance.yahoo.com",
|
| 195 |
-
"wellfound.com",
|
| 196 |
-
# Professional
|
| 197 |
-
"linkedin.com", "glassdoor.com",
|
| 198 |
-
# Review platforms
|
| 199 |
-
"g2.com", "capterra.com", "trustpilot.com",
|
| 200 |
-
# Industry-specific
|
| 201 |
-
"therobotreport.com", "exoskeletonreport.com", "medgadget.com",
|
| 202 |
-
}
|
| 203 |
-
|
| 204 |
-
# Expected data points per category — used for automatic gap detection.
|
| 205 |
-
# Each field maps to regex patterns that indicate coverage in result text.
|
| 206 |
-
CATEGORY_EXPECTED = {
|
| 207 |
-
"company_overview": {
|
| 208 |
-
"founding_year": [r'(?:founded|established|incorporated|started)\s+(?:in\s+)?(\d{4})'],
|
| 209 |
-
"leadership": [r'(?:CEO|Chief Executive|CTO|CFO|President|Founder|Chairman|COO)'],
|
| 210 |
-
"funding": [r'\$[\d,.]+\s*(?:million|billion|M|B)', r'(?:series\s+[A-F]|seed|IPO|funding\s+round)'],
|
| 211 |
-
"employees": [r'(\d[\d,]*)\s*(?:employees|staff|headcount|team\s+members|workers)'],
|
| 212 |
-
},
|
| 213 |
-
"product_technology": {
|
| 214 |
-
"products": [r'(?:product|device|robot|exoskeleton|system)\s'],
|
| 215 |
-
"technology": [r'(?:AI|machine\s+learning|deep\s+learning|sensor|actuator|algorithm|neural)'],
|
| 216 |
-
"patents": [r'(?:patent|IP|intellectual\s+property|invention)'],
|
| 217 |
-
"recent_launches": [r'(?:launch|release|announc|unveil|introduc)\w*\s+.{0,30}(?:2025|2026)'],
|
| 218 |
-
},
|
| 219 |
-
"regulatory_clinical": {
|
| 220 |
-
"fda_clearance": [r'(?:510\(?k\)?|FDA.?clear|FDA.?approv|de\s*novo)'],
|
| 221 |
-
"ce_mark": [r'(?:CE.?mark|MDR|EU.?approv|notified.?body)'],
|
| 222 |
-
"clinical_trials": [r'(?:clinical.?trial|NCT\d|randomized|controlled.?study|peer.?review)'],
|
| 223 |
-
"clinical_outcomes": [r'(?:outcome|efficacy|recovery.?rate|improvement|functional.?score)'],
|
| 224 |
-
},
|
| 225 |
-
"market_channels": {
|
| 226 |
-
"installations": [r'(?:hospital|clinic|center|install|deploy|site)\s'],
|
| 227 |
-
"reimbursement": [r'(?:reimburse|insurance|CMS|Medicare|Medicaid|HCPCS|coverage|payer)'],
|
| 228 |
-
"partnerships": [r'(?:partner|alliance|collaborat|distribut|reseller|dealer)'],
|
| 229 |
-
"events": [r'(?:conference|MEDICA|ACRM|CES|expo|trade\s+show|summit)'],
|
| 230 |
-
},
|
| 231 |
-
"vulnerabilities_threats": {
|
| 232 |
-
"weaknesses": [r'(?:weakness|problem|challenge|struggle|fail|recall|warning)'],
|
| 233 |
-
"financial_stress": [r'(?:layoff|restructur|loss|declining|debt|delist|penny.stock)'],
|
| 234 |
-
"employee_sentiment": [r'(?:glassdoor|employee.?review|work.?culture|turnover)'],
|
| 235 |
-
"expansion": [r'(?:expansion|new.?market|acqui|merger|market.?share|growth.?strategy)'],
|
| 236 |
-
},
|
| 237 |
-
}
|
| 238 |
-
|
| 239 |
-
# Per-category synthesis questions — the LLM answers these from search results.
|
| 240 |
-
# Tailored for rehabilitation robotics / medical device competitors.
|
| 241 |
-
CATEGORY_SYNTHESIS_QUESTIONS = {
|
| 242 |
-
"company_overview": [
|
| 243 |
-
"Founding story and key milestones",
|
| 244 |
-
"Leadership team (backgrounds, medical device experience)",
|
| 245 |
-
"Funding history (rounds, investors, valuations)",
|
| 246 |
-
"Employee count and growth trajectory",
|
| 247 |
-
],
|
| 248 |
-
"product_technology": [
|
| 249 |
-
"Product catalog (devices, indications, patient populations)",
|
| 250 |
-
"AI / machine learning capabilities (data they train on, algorithms used)",
|
| 251 |
-
"Recent product launches and roadmap clues (last 12 months)",
|
| 252 |
-
"Patent portfolio and innovation direction",
|
| 253 |
-
"How does their technology compare to GURMA.ai's outcome-data approach?",
|
| 254 |
-
],
|
| 255 |
-
"regulatory_clinical": [
|
| 256 |
-
"FDA clearances (510(k) numbers, De Novo, dates)",
|
| 257 |
-
"CE mark / MDR status in Europe",
|
| 258 |
-
"Active clinical trials (ClinicalTrials.gov entries, endpoints)",
|
| 259 |
-
"Published clinical outcomes (peer-reviewed studies, recovery rates)",
|
| 260 |
-
"Reimbursement status (CMS, Medicare, private payer coverage)",
|
| 261 |
-
],
|
| 262 |
-
"market_channels": [
|
| 263 |
-
"Hospital and clinic installations (how many sites, which countries)",
|
| 264 |
-
"Insurance and reimbursement strategy (pricing, payer relationships)",
|
| 265 |
-
"Distribution partnerships and reseller network",
|
| 266 |
-
"Conference and KOL presence (MEDICA, ACRM, physician endorsements)",
|
| 267 |
-
],
|
| 268 |
-
"vulnerabilities_threats": [
|
| 269 |
-
"What are they bad at? (clinical limitations, missing indications)",
|
| 270 |
-
"Financial health (SEC filings, cash burn, stock trajectory)",
|
| 271 |
-
"Employee sentiment (Glassdoor, hiring patterns, layoffs)",
|
| 272 |
-
"Growth strategy (acquisitions, new markets, AI investments)",
|
| 273 |
-
"What could they do that would hurt GURMA.ai most?",
|
| 274 |
-
"Early warning signals to monitor",
|
| 275 |
-
],
|
| 276 |
-
}
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
# ============================================================
|
| 280 |
-
# Search Backends (Open/Closed Principle)
|
| 281 |
-
# ============================================================
|
| 282 |
-
|
| 283 |
-
class SearchResult(Protocol):
|
| 284 |
-
"""Protocol for search result."""
|
| 285 |
-
title: str
|
| 286 |
-
url: str
|
| 287 |
-
snippet: str
|
| 288 |
-
source: str
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
@dataclass
|
| 292 |
-
class WebSearchResult:
|
| 293 |
-
"""Standard search result."""
|
| 294 |
-
title: str
|
| 295 |
-
url: str
|
| 296 |
-
snippet: str
|
| 297 |
-
source: str
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
class SearchBackend(ABC):
|
| 301 |
-
"""Abstract base for search backends (Liskov Substitution)."""
|
| 302 |
-
|
| 303 |
-
@property
|
| 304 |
-
@abstractmethod
|
| 305 |
-
def name(self) -> str:
|
| 306 |
-
"""Backend identifier."""
|
| 307 |
-
pass
|
| 308 |
-
|
| 309 |
-
@abstractmethod
|
| 310 |
-
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 311 |
-
"""Execute search and return results."""
|
| 312 |
-
pass
|
| 313 |
-
|
| 314 |
-
@abstractmethod
|
| 315 |
-
def is_available(self) -> bool:
|
| 316 |
-
"""Check if backend is available (dependencies, API keys)."""
|
| 317 |
-
pass
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
class DuckDuckGoBackend(SearchBackend):
|
| 321 |
-
"""DuckDuckGo search (no API key required)."""
|
| 322 |
-
|
| 323 |
-
@property
|
| 324 |
-
def name(self) -> str:
|
| 325 |
-
return "duckduckgo"
|
| 326 |
-
|
| 327 |
-
def is_available(self) -> bool:
|
| 328 |
-
try:
|
| 329 |
-
from ddgs import DDGS
|
| 330 |
-
return True
|
| 331 |
-
except ImportError:
|
| 332 |
-
try:
|
| 333 |
-
from duckduckgo_search import DDGS
|
| 334 |
-
return True
|
| 335 |
-
except ImportError:
|
| 336 |
-
return False
|
| 337 |
-
|
| 338 |
-
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 339 |
-
try:
|
| 340 |
-
from ddgs import DDGS
|
| 341 |
-
except ImportError:
|
| 342 |
-
from duckduckgo_search import DDGS
|
| 343 |
-
|
| 344 |
-
results = []
|
| 345 |
-
ddgs = DDGS()
|
| 346 |
-
for r in ddgs.text(query, max_results=max_results):
|
| 347 |
-
results.append(WebSearchResult(
|
| 348 |
-
title=r.get("title", ""),
|
| 349 |
-
url=r.get("href", r.get("link", "")),
|
| 350 |
-
snippet=r.get("body", r.get("snippet", "")),
|
| 351 |
-
source=self.name
|
| 352 |
-
))
|
| 353 |
-
return results
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
class SerpAPIBackend(SearchBackend):
|
| 357 |
-
"""SerpAPI search (requires API key)."""
|
| 358 |
-
|
| 359 |
-
@property
|
| 360 |
-
def name(self) -> str:
|
| 361 |
-
return "serpapi"
|
| 362 |
-
|
| 363 |
-
def is_available(self) -> bool:
|
| 364 |
-
try:
|
| 365 |
-
import requests
|
| 366 |
-
return bool(SERPAPI_KEY)
|
| 367 |
-
except ImportError:
|
| 368 |
-
return False
|
| 369 |
-
|
| 370 |
-
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 371 |
-
import requests
|
| 372 |
-
|
| 373 |
-
response = requests.get(
|
| 374 |
-
"https://serpapi.com/search",
|
| 375 |
-
params={"q": query, "api_key": SERPAPI_KEY, "engine": "google", "num": max_results},
|
| 376 |
-
timeout=30
|
| 377 |
-
)
|
| 378 |
-
response.raise_for_status()
|
| 379 |
-
data = response.json()
|
| 380 |
-
|
| 381 |
-
results = []
|
| 382 |
-
for r in data.get("organic_results", [])[:max_results]:
|
| 383 |
-
results.append(WebSearchResult(
|
| 384 |
-
title=r.get("title", ""),
|
| 385 |
-
url=r.get("link", ""),
|
| 386 |
-
snippet=r.get("snippet", ""),
|
| 387 |
-
source=self.name
|
| 388 |
-
))
|
| 389 |
-
return results
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
class BraveBackend(SearchBackend):
|
| 393 |
-
"""Brave search (requires API key)."""
|
| 394 |
-
|
| 395 |
-
@property
|
| 396 |
-
def name(self) -> str:
|
| 397 |
-
return "brave"
|
| 398 |
-
|
| 399 |
-
def is_available(self) -> bool:
|
| 400 |
-
try:
|
| 401 |
-
import requests
|
| 402 |
-
return bool(BRAVE_API_KEY)
|
| 403 |
-
except ImportError:
|
| 404 |
-
return False
|
| 405 |
-
|
| 406 |
-
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 407 |
-
import requests
|
| 408 |
-
|
| 409 |
-
response = requests.get(
|
| 410 |
-
"https://api.search.brave.com/res/v1/web/search",
|
| 411 |
-
headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY},
|
| 412 |
-
params={"q": query, "count": min(max_results, 20)},
|
| 413 |
-
timeout=30
|
| 414 |
-
)
|
| 415 |
-
response.raise_for_status()
|
| 416 |
-
data = response.json()
|
| 417 |
-
|
| 418 |
-
results = []
|
| 419 |
-
for r in data.get("web", {}).get("results", [])[:max_results]:
|
| 420 |
-
results.append(WebSearchResult(
|
| 421 |
-
title=r.get("title", ""),
|
| 422 |
-
url=r.get("url", ""),
|
| 423 |
-
snippet=r.get("description", ""),
|
| 424 |
-
source=self.name
|
| 425 |
-
))
|
| 426 |
-
return results
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
# Backend registry
|
| 430 |
-
BACKENDS: dict[str, SearchBackend] = {
|
| 431 |
-
"duckduckgo": DuckDuckGoBackend(),
|
| 432 |
-
"ddg": DuckDuckGoBackend(),
|
| 433 |
-
"serpapi": SerpAPIBackend(),
|
| 434 |
-
"brave": BraveBackend(),
|
| 435 |
-
}
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
def get_backend(name: str = "duckduckgo") -> SearchBackend:
|
| 439 |
-
"""Get search backend by name."""
|
| 440 |
-
backend = BACKENDS.get(name)
|
| 441 |
-
if not backend:
|
| 442 |
-
raise ValueError(f"Unknown backend: {name}. Available: {list(BACKENDS.keys())}")
|
| 443 |
-
if not backend.is_available():
|
| 444 |
-
raise RuntimeError(f"Backend '{name}' not available. Check dependencies/API keys.")
|
| 445 |
-
return backend
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
# ============================================================
|
| 449 |
-
# Result Storage (Single Responsibility)
|
| 450 |
-
# ============================================================
|
| 451 |
-
|
| 452 |
-
class ResultStorage:
|
| 453 |
-
"""Handles saving and loading search results."""
|
| 454 |
-
|
| 455 |
-
def __init__(self, directory: Path = RESEARCH_DIR):
|
| 456 |
-
self.directory = directory
|
| 457 |
-
self.directory.mkdir(parents=True, exist_ok=True)
|
| 458 |
-
|
| 459 |
-
def save(self, query: str, results: list[WebSearchResult], backend: str) -> tuple[Path, Path]:
|
| 460 |
-
"""Save results in JSON and Markdown formats. Returns (json_path, md_path)."""
|
| 461 |
-
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 462 |
-
slug = self._slugify(query)
|
| 463 |
-
base_name = f"{timestamp}_{slug}"
|
| 464 |
-
|
| 465 |
-
# Build data
|
| 466 |
-
data = {
|
| 467 |
-
"query": query,
|
| 468 |
-
"timestamp": datetime.now().isoformat(),
|
| 469 |
-
"backend": backend,
|
| 470 |
-
"result_count": len(results),
|
| 471 |
-
"results": [asdict(r) for r in results]
|
| 472 |
-
}
|
| 473 |
-
|
| 474 |
-
# Save JSON
|
| 475 |
-
json_path = self.directory / f"{base_name}.json"
|
| 476 |
-
with open(json_path, "w") as f:
|
| 477 |
-
json.dump(data, f, indent=2)
|
| 478 |
-
|
| 479 |
-
# Save Markdown
|
| 480 |
-
md_path = self.directory / f"{base_name}.md"
|
| 481 |
-
with open(md_path, "w") as f:
|
| 482 |
-
f.write(f"# Search: {query}\n\n")
|
| 483 |
-
f.write(f"**Date:** {data['timestamp']} \n")
|
| 484 |
-
f.write(f"**Backend:** {backend} \n")
|
| 485 |
-
f.write(f"**Results:** {len(results)}\n\n---\n")
|
| 486 |
-
for i, r in enumerate(results, 1):
|
| 487 |
-
f.write(f"\n## {i}. {r.title}\n\n**URL:** {r.url}\n\n{r.snippet}\n")
|
| 488 |
-
|
| 489 |
-
return json_path, md_path
|
| 490 |
-
|
| 491 |
-
def list_searches(self, limit: int = 20) -> list[dict]:
|
| 492 |
-
"""List recent saved searches."""
|
| 493 |
-
searches = []
|
| 494 |
-
for json_file in sorted(self.directory.glob("*.json"), reverse=True):
|
| 495 |
-
if json_file.name.startswith("."):
|
| 496 |
-
continue
|
| 497 |
-
try:
|
| 498 |
-
with open(json_file) as f:
|
| 499 |
-
data = json.load(f)
|
| 500 |
-
searches.append({
|
| 501 |
-
"file": json_file.name,
|
| 502 |
-
"query": data.get("query", ""),
|
| 503 |
-
"timestamp": data.get("timestamp", ""),
|
| 504 |
-
"results": data.get("result_count", 0)
|
| 505 |
-
})
|
| 506 |
-
except:
|
| 507 |
-
pass
|
| 508 |
-
if len(searches) >= limit:
|
| 509 |
-
break
|
| 510 |
-
return searches
|
| 511 |
-
|
| 512 |
-
def get_recent_queries(self, days: int = 7) -> set[str]:
|
| 513 |
-
"""Get queries executed within the last N days (normalized for deduplication)."""
|
| 514 |
-
cutoff = datetime.now() - timedelta(days=days)
|
| 515 |
-
recent = set()
|
| 516 |
-
|
| 517 |
-
for json_file in self.directory.glob("*.json"):
|
| 518 |
-
if json_file.name.startswith("."):
|
| 519 |
-
continue
|
| 520 |
-
try:
|
| 521 |
-
with open(json_file) as f:
|
| 522 |
-
data = json.load(f)
|
| 523 |
-
ts = data.get("timestamp", "")
|
| 524 |
-
if ts:
|
| 525 |
-
file_date = datetime.fromisoformat(ts.replace("Z", "+00:00").split("+")[0])
|
| 526 |
-
if file_date >= cutoff:
|
| 527 |
-
query = data.get("query", "").lower().strip()
|
| 528 |
-
recent.add(query)
|
| 529 |
-
except:
|
| 530 |
-
pass
|
| 531 |
-
return recent
|
| 532 |
-
|
| 533 |
-
def _slugify(self, text: str, max_len: int = 50) -> str:
|
| 534 |
-
"""Convert text to filesystem-safe slug."""
|
| 535 |
-
slug = text.lower()[:max_len].replace(" ", "-").replace("/", "-")
|
| 536 |
-
return "".join(c for c in slug if c.isalnum() or c == "-")
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
# ============================================================
|
| 540 |
-
# Search Service (Facade Pattern)
|
| 541 |
-
# ============================================================
|
| 542 |
-
|
| 543 |
-
class SearchService:
|
| 544 |
-
"""High-level search interface combining backend and storage."""
|
| 545 |
-
|
| 546 |
-
def __init__(self, backend: str = "duckduckgo", storage: ResultStorage = None):
|
| 547 |
-
self.backend = get_backend(backend)
|
| 548 |
-
self.storage = storage or ResultStorage()
|
| 549 |
-
|
| 550 |
-
def search(self, query: str, max_results: int = 10, save: bool = True) -> list[WebSearchResult]:
|
| 551 |
-
"""Execute search, optionally save results."""
|
| 552 |
-
results = self.backend.search(query, max_results)
|
| 553 |
-
if save and results:
|
| 554 |
-
self.storage.save(query, results, self.backend.name)
|
| 555 |
-
return results
|
| 556 |
-
|
| 557 |
-
def search_batch(self, queries: list[str], max_results: int = 10,
|
| 558 |
-
delay: float = 0.5, callback=None) -> dict[str, int]:
|
| 559 |
-
"""Execute multiple searches with rate limiting.
|
| 560 |
-
|
| 561 |
-
Returns dict of {query: result_count}.
|
| 562 |
-
"""
|
| 563 |
-
stats = {}
|
| 564 |
-
for i, query in enumerate(queries, 1):
|
| 565 |
-
if callback:
|
| 566 |
-
callback(i, len(queries), query)
|
| 567 |
-
try:
|
| 568 |
-
results = self.search(query, max_results, save=True)
|
| 569 |
-
stats[query] = len(results)
|
| 570 |
-
except Exception as e:
|
| 571 |
-
stats[query] = -1 # Error indicator
|
| 572 |
-
print(f"Error on '{query}': {e}", file=sys.stderr)
|
| 573 |
-
|
| 574 |
-
if delay > 0 and i < len(queries):
|
| 575 |
-
time.sleep(delay)
|
| 576 |
-
|
| 577 |
-
return stats
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
# ============================================================
|
| 581 |
-
# LLM Integration (Dependency Inversion)
|
| 582 |
-
# ============================================================
|
| 583 |
-
|
| 584 |
-
class LLMClient:
|
| 585 |
-
"""OpenRouter LLM client for research analysis."""
|
| 586 |
-
|
| 587 |
-
def __init__(self, api_key: str = None, model: str = LLM_MODEL):
|
| 588 |
-
self.api_key = api_key or OPENROUTER_API_KEY
|
| 589 |
-
self.model = model
|
| 590 |
-
self.enabled = bool(self.api_key)
|
| 591 |
-
|
| 592 |
-
def call(self, prompt: str, system: str = None, max_tokens: int = 1000) -> Optional[str]:
|
| 593 |
-
"""Make LLM API call. Returns response text or None."""
|
| 594 |
-
if not self.enabled:
|
| 595 |
-
return None
|
| 596 |
-
|
| 597 |
-
import requests
|
| 598 |
-
|
| 599 |
-
messages = []
|
| 600 |
-
if system:
|
| 601 |
-
messages.append({"role": "system", "content": system})
|
| 602 |
-
messages.append({"role": "user", "content": prompt})
|
| 603 |
-
|
| 604 |
-
try:
|
| 605 |
-
response = requests.post(
|
| 606 |
-
"https://openrouter.ai/api/v1/chat/completions",
|
| 607 |
-
headers={
|
| 608 |
-
"Authorization": f"Bearer {self.api_key}",
|
| 609 |
-
"Content-Type": "application/json",
|
| 610 |
-
},
|
| 611 |
-
json={
|
| 612 |
-
"model": self.model,
|
| 613 |
-
"messages": messages,
|
| 614 |
-
"max_tokens": max_tokens,
|
| 615 |
-
"temperature": 0.3,
|
| 616 |
-
},
|
| 617 |
-
timeout=60
|
| 618 |
-
)
|
| 619 |
-
response.raise_for_status()
|
| 620 |
-
return response.json()["choices"][0]["message"]["content"]
|
| 621 |
-
except Exception as e:
|
| 622 |
-
print(f"[LLM ERROR] {e}", file=sys.stderr)
|
| 623 |
-
return None
|
| 624 |
-
|
| 625 |
-
def generate_category_queries(self, company: str, category_label: str) -> list[str]:
|
| 626 |
-
"""Generate additional search queries for a specific intel category."""
|
| 627 |
-
system = (
|
| 628 |
-
"You are a competitive intelligence analyst specializing in "
|
| 629 |
-
"rehabilitation robotics and medical devices. "
|
| 630 |
-
"Generate specific, targeted web search queries. "
|
| 631 |
-
"Return ONLY a JSON array of query strings. "
|
| 632 |
-
"Focus on recent sources (last 18 months). Prioritize primary sources."
|
| 633 |
-
)
|
| 634 |
-
|
| 635 |
-
prompt = f"""Company: {company}
|
| 636 |
-
Category: {category_label}
|
| 637 |
-
|
| 638 |
-
Generate 3-4 additional specific search queries for deep competitive intelligence on this company in this category.
|
| 639 |
-
Focus on primary sources: company blog, official announcements, SEC filings, patent databases, verified review sites, job postings.
|
| 640 |
-
Return as JSON array: ["query1", "query2", ...]"""
|
| 641 |
-
|
| 642 |
-
response = self.call(prompt, system)
|
| 643 |
-
if response:
|
| 644 |
-
match = re.search(r'\[.*\]', response, re.DOTALL)
|
| 645 |
-
if match:
|
| 646 |
-
try:
|
| 647 |
-
return json.loads(match.group())[:4]
|
| 648 |
-
except Exception:
|
| 649 |
-
pass
|
| 650 |
-
return []
|
| 651 |
-
|
| 652 |
-
def synthesize_intel(self, company: str, category_key: str,
|
| 653 |
-
category_label: str, results: list) -> dict:
|
| 654 |
-
"""Synthesize search results into structured intelligence.
|
| 655 |
-
|
| 656 |
-
Uses per-category questions from CATEGORY_SYNTHESIS_QUESTIONS to
|
| 657 |
-
produce distilled, actionable findings instead of raw snippets.
|
| 658 |
-
Returns dict with 'findings' and 'gaps'.
|
| 659 |
-
"""
|
| 660 |
-
questions = CATEGORY_SYNTHESIS_QUESTIONS.get(category_key, [])
|
| 661 |
-
if not questions:
|
| 662 |
-
return {"findings": [], "gaps": []}
|
| 663 |
-
|
| 664 |
-
questions_text = "\n".join(f"- {q}" for q in questions)
|
| 665 |
-
|
| 666 |
-
results_text = "\n".join([
|
| 667 |
-
f"- [{r.source}] {r.title}\n {r.snippet[:300]}\n URL: {r.url}"
|
| 668 |
-
for r in results[:15]
|
| 669 |
-
])
|
| 670 |
-
|
| 671 |
-
system = (
|
| 672 |
-
"You are a competitive intelligence analyst for GURMA.ai, "
|
| 673 |
-
"a Swiss AI company entering rehabilitation robotics with "
|
| 674 |
-
"15 years of patient outcome data from BAMA Teknoloji. "
|
| 675 |
-
"Synthesize search results into actionable intelligence. "
|
| 676 |
-
"Recent sources only (last 18 months). "
|
| 677 |
-
"Flag speculation vs confirmed facts. Include URLs."
|
| 678 |
-
)
|
| 679 |
-
|
| 680 |
-
prompt = f"""Conduct deep competitive intelligence on {company}.
|
| 681 |
-
Category: {category_label}
|
| 682 |
-
|
| 683 |
-
Answer these specific questions based on the search results:
|
| 684 |
-
{questions_text}
|
| 685 |
-
|
| 686 |
-
Search results:
|
| 687 |
-
{results_text}
|
| 688 |
-
|
| 689 |
-
Return JSON:
|
| 690 |
-
{{
|
| 691 |
-
"findings": [
|
| 692 |
-
{{"text": "synthesized answer to one of the questions", "confirmed": true, "source": "url"}},
|
| 693 |
-
{{"text": "inferred insight", "confirmed": false, "source": "url or empty"}}
|
| 694 |
-
],
|
| 695 |
-
"gaps": [
|
| 696 |
-
{{"text": "question that could NOT be answered from search results"}}
|
| 697 |
-
]
|
| 698 |
-
}}
|
| 699 |
-
|
| 700 |
-
Rules:
|
| 701 |
-
- confirmed=true ONLY for facts from primary sources (company website, SEC filings, press releases)
|
| 702 |
-
- confirmed=false for inferred or secondary-source information
|
| 703 |
-
- Each finding should directly answer one of the questions above
|
| 704 |
-
- Be specific and quantitative where possible
|
| 705 |
-
- If a question cannot be answered, add it to gaps
|
| 706 |
-
- Maximum 12 findings"""
|
| 707 |
-
|
| 708 |
-
response = self.call(prompt, system, max_tokens=2000)
|
| 709 |
-
if response:
|
| 710 |
-
match = re.search(r'\{.*\}', response, re.DOTALL)
|
| 711 |
-
if match:
|
| 712 |
-
try:
|
| 713 |
-
return json.loads(match.group())
|
| 714 |
-
except Exception:
|
| 715 |
-
pass
|
| 716 |
-
return {"findings": [], "gaps": []}
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
# ============================================================
|
| 720 |
-
# Deep Competitive Intelligence Agent
|
| 721 |
-
# ============================================================
|
| 722 |
-
|
| 723 |
-
@dataclass
|
| 724 |
-
class IntelSection:
|
| 725 |
-
"""A section of the competitive intelligence report."""
|
| 726 |
-
category: str
|
| 727 |
-
label: str
|
| 728 |
-
queries_executed: list = field(default_factory=list)
|
| 729 |
-
results: list = field(default_factory=list)
|
| 730 |
-
findings: list = field(default_factory=list)
|
| 731 |
-
gaps: list = field(default_factory=list)
|
| 732 |
-
sources: list = field(default_factory=list)
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
class CompetitorIntelAgent:
|
| 736 |
-
"""Deep competitive intelligence agent for a single competitor.
|
| 737 |
-
|
| 738 |
-
Runs structured research across 7 categories and produces
|
| 739 |
-
a markdown + JSON report with [CONFIRMED]/[SPECULATIVE] tagging.
|
| 740 |
-
|
| 741 |
-
Usage:
|
| 742 |
-
agent = CompetitorIntelAgent("Ekso Bionics")
|
| 743 |
-
report = agent.run() # built-in analysis
|
| 744 |
-
report = agent.run(use_external_llm=True) # + OpenRouter
|
| 745 |
-
"""
|
| 746 |
-
|
| 747 |
-
def __init__(self, company: str, search: SearchService = None, llm: LLMClient = None):
|
| 748 |
-
self.company = company
|
| 749 |
-
self.search = search or SearchService()
|
| 750 |
-
self.llm = llm or LLMClient()
|
| 751 |
-
self.sections: dict[str, IntelSection] = {}
|
| 752 |
-
self.output_dir = RESEARCH_DIR / "intel"
|
| 753 |
-
self.output_dir.mkdir(parents=True, exist_ok=True)
|
| 754 |
-
|
| 755 |
-
def run(self, categories: list[str] = None, use_external_llm: bool = False,
|
| 756 |
-
delay: float = 1.0, max_results: int = 10) -> Path:
|
| 757 |
-
"""Run deep competitive intelligence and generate report.
|
| 758 |
-
|
| 759 |
-
Built-in analysis (source scoring, dedup, gap detection) always runs.
|
| 760 |
-
|
| 761 |
-
Args:
|
| 762 |
-
categories: Which categories to research (default: all 7)
|
| 763 |
-
use_external_llm: Also use external LLM (OpenRouter) for enhanced analysis
|
| 764 |
-
delay: Delay between searches in seconds (rate limiting)
|
| 765 |
-
max_results: Max results per search query
|
| 766 |
-
|
| 767 |
-
Returns: Path to generated markdown report
|
| 768 |
-
"""
|
| 769 |
-
cats = categories or list(DEEP_INTEL_CATEGORIES.keys())
|
| 770 |
-
|
| 771 |
-
total_queries = sum(
|
| 772 |
-
len(DEEP_INTEL_CATEGORIES[c]["queries"])
|
| 773 |
-
for c in cats if c in DEEP_INTEL_CATEGORIES
|
| 774 |
-
)
|
| 775 |
-
|
| 776 |
-
print(f"\n{'='*60}")
|
| 777 |
-
print(f"Deep Competitive Intelligence: {self.company}")
|
| 778 |
-
print(f"Categories: {len(cats)} | Queries: ~{total_queries}")
|
| 779 |
-
print(f"Analysis: built-in{' + external LLM' if use_external_llm and self.llm.enabled else ''}")
|
| 780 |
-
print(f"{'='*60}\n")
|
| 781 |
-
|
| 782 |
-
for cat_key in cats:
|
| 783 |
-
cat = DEEP_INTEL_CATEGORIES.get(cat_key)
|
| 784 |
-
if not cat:
|
| 785 |
-
print(f"[SKIP] Unknown category: {cat_key}")
|
| 786 |
-
continue
|
| 787 |
-
|
| 788 |
-
section = IntelSection(category=cat_key, label=cat["label"])
|
| 789 |
-
self._research_category(section, cat, use_external_llm, delay, max_results)
|
| 790 |
-
self.sections[cat_key] = section
|
| 791 |
-
|
| 792 |
-
report_path = self._generate_report(use_external_llm)
|
| 793 |
-
self._save_data()
|
| 794 |
-
|
| 795 |
-
print(f"\n{'='*60}")
|
| 796 |
-
print(f"Report: {report_path}")
|
| 797 |
-
total_findings = sum(len(s.findings) for s in self.sections.values())
|
| 798 |
-
total_gaps = sum(len(s.gaps) for s in self.sections.values())
|
| 799 |
-
print(f"Findings: {total_findings} | Gaps: {total_gaps}")
|
| 800 |
-
print(f"{'='*60}\n")
|
| 801 |
-
|
| 802 |
-
return report_path
|
| 803 |
-
|
| 804 |
-
def _research_category(self, section: IntelSection, cat: dict,
|
| 805 |
-
use_external_llm: bool, delay: float, max_results: int):
|
| 806 |
-
"""Research a single category: generate queries, search, analyze.
|
| 807 |
-
|
| 808 |
-
Built-in analysis (source scoring, dedup, gap detection) always runs.
|
| 809 |
-
External LLM (OpenRouter) is an optional enhancement on top.
|
| 810 |
-
"""
|
| 811 |
-
print(f"\n--- {section.label} ---")
|
| 812 |
-
|
| 813 |
-
queries = [q.format(company=self.company) for q in cat["queries"]]
|
| 814 |
-
|
| 815 |
-
# External LLM can generate additional targeted queries
|
| 816 |
-
if use_external_llm and self.llm.enabled:
|
| 817 |
-
extra = self.llm.generate_category_queries(self.company, section.label)
|
| 818 |
-
if extra:
|
| 819 |
-
queries.extend(extra)
|
| 820 |
-
print(f" [EXTERNAL LLM] +{len(extra)} additional queries")
|
| 821 |
-
|
| 822 |
-
for query in queries:
|
| 823 |
-
print(f" [SEARCH] {query}")
|
| 824 |
-
try:
|
| 825 |
-
results = self.search.search(query, max_results=max_results, save=True)
|
| 826 |
-
section.queries_executed.append(query)
|
| 827 |
-
section.results.extend(results)
|
| 828 |
-
for r in results:
|
| 829 |
-
if r.url and r.url not in section.sources:
|
| 830 |
-
section.sources.append(r.url)
|
| 831 |
-
print(f" -> {len(results)} results")
|
| 832 |
-
except Exception as e:
|
| 833 |
-
print(f" -> Error: {e}")
|
| 834 |
-
|
| 835 |
-
if delay > 0:
|
| 836 |
-
time.sleep(delay)
|
| 837 |
-
|
| 838 |
-
# Always run built-in analysis (no external API needed)
|
| 839 |
-
section.findings = self._analyze_section(section)
|
| 840 |
-
section.gaps = self._detect_gaps(section)
|
| 841 |
-
|
| 842 |
-
confirmed = sum(1 for f in section.findings if f.get("confirmed"))
|
| 843 |
-
speculative = len(section.findings) - confirmed
|
| 844 |
-
print(f" [ANALYSIS] {len(section.findings)} findings ({confirmed} confirmed, {speculative} speculative)")
|
| 845 |
-
if section.gaps:
|
| 846 |
-
print(f" [GAPS] {len(section.gaps)}: {', '.join(g['text'] for g in section.gaps[:3])}")
|
| 847 |
-
|
| 848 |
-
# LLM synthesis — automatic when OpenRouter is available
|
| 849 |
-
if self.llm.enabled and section.results:
|
| 850 |
-
print(f" [SYNTHESIS] Synthesizing {section.label}...")
|
| 851 |
-
synthesis = self.llm.synthesize_intel(
|
| 852 |
-
self.company, section.category, section.label, section.results
|
| 853 |
-
)
|
| 854 |
-
synth_findings = synthesis.get("findings", [])
|
| 855 |
-
synth_gaps = synthesis.get("gaps", [])
|
| 856 |
-
|
| 857 |
-
if synth_findings:
|
| 858 |
-
# Synthesized findings are distilled answers — use them as primary.
|
| 859 |
-
# Append any built-in findings from sources the LLM missed.
|
| 860 |
-
synth_sources = {f.get("source", "") for f in synth_findings if f.get("source")}
|
| 861 |
-
for bf in section.findings:
|
| 862 |
-
if bf.get("source") and bf["source"] not in synth_sources:
|
| 863 |
-
synth_findings.append(bf)
|
| 864 |
-
section.findings = synth_findings
|
| 865 |
-
|
| 866 |
-
for f in synth_findings:
|
| 867 |
-
if isinstance(f, dict):
|
| 868 |
-
tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
|
| 869 |
-
print(f" {tag} {f.get('text', '')[:80]}")
|
| 870 |
-
|
| 871 |
-
# Merge gaps from synthesis with built-in gaps
|
| 872 |
-
existing_gaps = {g["text"].lower() for g in section.gaps}
|
| 873 |
-
for sg in synth_gaps:
|
| 874 |
-
gap_text = sg.get("text", sg) if isinstance(sg, dict) else sg
|
| 875 |
-
if gap_text.lower() not in existing_gaps:
|
| 876 |
-
section.gaps.append({"text": gap_text})
|
| 877 |
-
|
| 878 |
-
def _analyze_section(self, section: IntelSection) -> list[dict]:
|
| 879 |
-
"""Built-in smart analysis: source scoring, dedup, structured extraction.
|
| 880 |
-
|
| 881 |
-
This runs without any external LLM. It:
|
| 882 |
-
1. Filters results to those mentioning the company
|
| 883 |
-
2. Scores each source as confirmed (primary) or speculative (secondary)
|
| 884 |
-
3. Deduplicates by title similarity
|
| 885 |
-
4. Returns structured findings capped at 15 per section
|
| 886 |
-
"""
|
| 887 |
-
findings = []
|
| 888 |
-
seen_keys = set()
|
| 889 |
-
aliases = self._get_aliases()
|
| 890 |
-
|
| 891 |
-
for r in section.results:
|
| 892 |
-
text_lower = f"{r.title} {r.snippet}".lower()
|
| 893 |
-
|
| 894 |
-
# Only include results that mention the company
|
| 895 |
-
if not any(alias in text_lower for alias in aliases):
|
| 896 |
-
continue
|
| 897 |
-
|
| 898 |
-
# Deduplicate by normalized title prefix
|
| 899 |
-
dedup_key = re.sub(r'[^a-z0-9]', '', r.title.lower()[:50])
|
| 900 |
-
if dedup_key in seen_keys:
|
| 901 |
-
continue
|
| 902 |
-
seen_keys.add(dedup_key)
|
| 903 |
-
|
| 904 |
-
# Score source quality
|
| 905 |
-
confirmed = self._is_primary_source(r.url)
|
| 906 |
-
|
| 907 |
-
# Clean finding text
|
| 908 |
-
title = r.title.strip()
|
| 909 |
-
snippet = r.snippet.strip()[:250]
|
| 910 |
-
finding_text = f"{title}: {snippet}" if snippet else title
|
| 911 |
-
|
| 912 |
-
findings.append({
|
| 913 |
-
"text": finding_text,
|
| 914 |
-
"source": r.url,
|
| 915 |
-
"confirmed": confirmed,
|
| 916 |
-
})
|
| 917 |
-
|
| 918 |
-
# Sort: confirmed first, then by text length (richer content first)
|
| 919 |
-
findings.sort(key=lambda f: (not f["confirmed"], -len(f["text"])))
|
| 920 |
-
return findings[:15]
|
| 921 |
-
|
| 922 |
-
def _is_primary_source(self, url: str) -> bool:
|
| 923 |
-
"""Score whether a URL is a primary/authoritative source.
|
| 924 |
-
|
| 925 |
-
Primary = company's own site, regulatory filings, financial databases,
|
| 926 |
-
established industry publications, review platforms.
|
| 927 |
-
"""
|
| 928 |
-
if not url:
|
| 929 |
-
return False
|
| 930 |
-
url_lower = url.lower()
|
| 931 |
-
|
| 932 |
-
# Check known primary domains
|
| 933 |
-
for domain in PRIMARY_SOURCE_DOMAINS:
|
| 934 |
-
if domain in url_lower:
|
| 935 |
-
return True
|
| 936 |
-
|
| 937 |
-
# Check if it's the company's own domain
|
| 938 |
-
for alias in self._get_aliases():
|
| 939 |
-
# Normalize: "ekso bionics" -> "eksobionics", "ekso"
|
| 940 |
-
slug = alias.replace(" ", "")
|
| 941 |
-
if len(slug) >= 4 and slug in url_lower.split("/")[2] if len(url_lower.split("/")) > 2 else False:
|
| 942 |
-
return True
|
| 943 |
-
|
| 944 |
-
return False
|
| 945 |
-
|
| 946 |
-
def _detect_gaps(self, section: IntelSection) -> list[dict]:
|
| 947 |
-
"""Detect missing data points for this category.
|
| 948 |
-
|
| 949 |
-
Checks findings text against expected patterns per category.
|
| 950 |
-
Returns list of gap dicts for fields with no matching data.
|
| 951 |
-
"""
|
| 952 |
-
expected = CATEGORY_EXPECTED.get(section.category, {})
|
| 953 |
-
if not expected:
|
| 954 |
-
return []
|
| 955 |
-
|
| 956 |
-
# Build text corpus from company-relevant results only
|
| 957 |
-
aliases = self._get_aliases()
|
| 958 |
-
relevant_text = " ".join(
|
| 959 |
-
f"{r.title} {r.snippet}"
|
| 960 |
-
for r in section.results
|
| 961 |
-
if any(a in f"{r.title} {r.snippet}".lower() for a in aliases)
|
| 962 |
-
)
|
| 963 |
-
|
| 964 |
-
if not relevant_text:
|
| 965 |
-
return [{"text": f"No relevant results found for {section.label}"}]
|
| 966 |
-
|
| 967 |
-
relevant_lower = relevant_text.lower()
|
| 968 |
-
gaps = []
|
| 969 |
-
for field_name, patterns in expected.items():
|
| 970 |
-
found = any(
|
| 971 |
-
re.search(p, relevant_lower, re.IGNORECASE)
|
| 972 |
-
for p in patterns
|
| 973 |
-
)
|
| 974 |
-
if not found:
|
| 975 |
-
label = field_name.replace("_", " ").replace("/", " / ")
|
| 976 |
-
gaps.append({"text": f"No data found for: {label}"})
|
| 977 |
-
|
| 978 |
-
return gaps
|
| 979 |
-
|
| 980 |
-
def _get_aliases(self) -> list[str]:
|
| 981 |
-
"""Get lowercase company aliases for text matching."""
|
| 982 |
-
info = COMPANY_DEFINITIONS.get(self.company, {})
|
| 983 |
-
aliases = info.get("aliases", [])
|
| 984 |
-
if not aliases:
|
| 985 |
-
aliases = [self.company.lower()]
|
| 986 |
-
return aliases
|
| 987 |
-
|
| 988 |
-
def _generate_report(self, use_external_llm: bool) -> Path:
|
| 989 |
-
"""Generate structured markdown report."""
|
| 990 |
-
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 991 |
-
slug = self.company.lower().replace(" ", "-").replace("/", "-")
|
| 992 |
-
slug = "".join(c for c in slug if c.isalnum() or c == "-")
|
| 993 |
-
|
| 994 |
-
report_path = self.output_dir / f"{timestamp}_{slug}_intel.md"
|
| 995 |
-
|
| 996 |
-
method = "Built-in analysis"
|
| 997 |
-
if LLM_ENABLED:
|
| 998 |
-
method += " + LLM synthesis (OpenRouter)"
|
| 999 |
-
if use_external_llm:
|
| 1000 |
-
method += " + extra query generation"
|
| 1001 |
-
|
| 1002 |
-
lines = [
|
| 1003 |
-
f"# Competitive Intelligence: {self.company}",
|
| 1004 |
-
"",
|
| 1005 |
-
f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')} ",
|
| 1006 |
-
f"**Method:** {method} ",
|
| 1007 |
-
f"**Searches:** {sum(len(s.queries_executed) for s in self.sections.values())} ",
|
| 1008 |
-
f"**Sources:** {sum(len(s.sources) for s in self.sections.values())} unique URLs",
|
| 1009 |
-
"",
|
| 1010 |
-
"> **Legend:** [CONFIRMED] = from primary/verified source | [SPECULATIVE] = inferred or unverified",
|
| 1011 |
-
"",
|
| 1012 |
-
"---",
|
| 1013 |
-
]
|
| 1014 |
-
|
| 1015 |
-
for section in self.sections.values():
|
| 1016 |
-
lines.append("")
|
| 1017 |
-
lines.append(f"## {section.label}")
|
| 1018 |
-
lines.append("")
|
| 1019 |
-
|
| 1020 |
-
if not section.findings:
|
| 1021 |
-
lines.append("*No findings. Try broader queries or `--external-llm` for additional analysis.*")
|
| 1022 |
-
lines.append("")
|
| 1023 |
-
continue
|
| 1024 |
-
|
| 1025 |
-
for f in section.findings:
|
| 1026 |
-
if isinstance(f, dict):
|
| 1027 |
-
tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
|
| 1028 |
-
text = f.get("text", "")
|
| 1029 |
-
source = f.get("source", "")
|
| 1030 |
-
lines.append(f"- **{tag}** {text}")
|
| 1031 |
-
if source:
|
| 1032 |
-
lines.append(f" - Source: {source}")
|
| 1033 |
-
else:
|
| 1034 |
-
lines.append(f"- {f}")
|
| 1035 |
-
|
| 1036 |
-
if section.gaps:
|
| 1037 |
-
lines.append("")
|
| 1038 |
-
lines.append("**Knowledge Gaps:**")
|
| 1039 |
-
for gap in section.gaps:
|
| 1040 |
-
gap_text = gap.get("text", gap) if isinstance(gap, dict) else gap
|
| 1041 |
-
lines.append(f"- [ ] {gap_text}")
|
| 1042 |
-
|
| 1043 |
-
lines.append("")
|
| 1044 |
-
|
| 1045 |
-
if section.sources:
|
| 1046 |
-
lines.append(f"<details><summary>Sources ({len(section.sources)} URLs)</summary>")
|
| 1047 |
-
lines.append("")
|
| 1048 |
-
for url in section.sources[:10]:
|
| 1049 |
-
lines.append(f"- {url}")
|
| 1050 |
-
if len(section.sources) > 10:
|
| 1051 |
-
lines.append(f"- ... and {len(section.sources) - 10} more")
|
| 1052 |
-
lines.append("")
|
| 1053 |
-
lines.append("</details>")
|
| 1054 |
-
lines.append("")
|
| 1055 |
-
|
| 1056 |
-
# Summary
|
| 1057 |
-
lines.extend(["---", "", "## Summary", ""])
|
| 1058 |
-
|
| 1059 |
-
total_findings = sum(len(s.findings) for s in self.sections.values())
|
| 1060 |
-
confirmed = sum(
|
| 1061 |
-
sum(1 for f in s.findings if isinstance(f, dict) and f.get("confirmed"))
|
| 1062 |
-
for s in self.sections.values()
|
| 1063 |
-
)
|
| 1064 |
-
speculative = total_findings - confirmed
|
| 1065 |
-
|
| 1066 |
-
lines.append(f"| Metric | Count |")
|
| 1067 |
-
lines.append(f"|--------|-------|")
|
| 1068 |
-
lines.append(f"| Total findings | {total_findings} |")
|
| 1069 |
-
lines.append(f"| Confirmed | {confirmed} |")
|
| 1070 |
-
lines.append(f"| Speculative | {speculative} |")
|
| 1071 |
-
lines.append(f"| Categories | {len(self.sections)} |")
|
| 1072 |
-
lines.append("")
|
| 1073 |
-
|
| 1074 |
-
all_gaps = []
|
| 1075 |
-
for s in self.sections.values():
|
| 1076 |
-
for g in s.gaps:
|
| 1077 |
-
gap_text = g.get("text", g) if isinstance(g, dict) else g
|
| 1078 |
-
all_gaps.append(f"{s.label}: {gap_text}")
|
| 1079 |
-
|
| 1080 |
-
if all_gaps:
|
| 1081 |
-
lines.append("### Outstanding Gaps")
|
| 1082 |
-
lines.append("")
|
| 1083 |
-
for gap in all_gaps:
|
| 1084 |
-
lines.append(f"- [ ] {gap}")
|
| 1085 |
-
lines.append("")
|
| 1086 |
-
|
| 1087 |
-
with open(report_path, "w") as f:
|
| 1088 |
-
f.write("\n".join(lines))
|
| 1089 |
-
|
| 1090 |
-
return report_path
|
| 1091 |
-
|
| 1092 |
-
def _save_data(self):
|
| 1093 |
-
"""Save structured intel data as JSON alongside the report."""
|
| 1094 |
-
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 1095 |
-
slug = self.company.lower().replace(" ", "-").replace("/", "-")
|
| 1096 |
-
slug = "".join(c for c in slug if c.isalnum() or c == "-")
|
| 1097 |
-
|
| 1098 |
-
data = {
|
| 1099 |
-
"company": self.company,
|
| 1100 |
-
"generated": datetime.now().isoformat(),
|
| 1101 |
-
"sections": {},
|
| 1102 |
-
}
|
| 1103 |
-
|
| 1104 |
-
for cat_key, section in self.sections.items():
|
| 1105 |
-
data["sections"][cat_key] = {
|
| 1106 |
-
"label": section.label,
|
| 1107 |
-
"queries_executed": section.queries_executed,
|
| 1108 |
-
"finding_count": len(section.findings),
|
| 1109 |
-
"findings": section.findings,
|
| 1110 |
-
"gaps": section.gaps,
|
| 1111 |
-
"source_count": len(section.sources),
|
| 1112 |
-
"sources": section.sources[:20],
|
| 1113 |
-
}
|
| 1114 |
-
|
| 1115 |
-
json_path = self.output_dir / f"{timestamp}_{slug}_intel.json"
|
| 1116 |
-
with open(json_path, "w") as f:
|
| 1117 |
-
json.dump(data, f, indent=2)
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
# ============================================================
|
| 1121 |
-
# Competitor Extraction (Data Processing)
|
| 1122 |
-
# ============================================================
|
| 1123 |
-
|
| 1124 |
-
# Company definitions for extraction
|
| 1125 |
-
COMPANY_DEFINITIONS = {
|
| 1126 |
-
"Hocoma": {"aliases": ["hocoma", "dih", "lokomat"], "country": "Switzerland", "product": "Lokomat", "status": "collapsed", "verified": True},
|
| 1127 |
-
"Ekso Bionics": {"aliases": ["ekso", "eksobionics", "eksonr"], "country": "USA", "product": "EksoNR", "status": "weak", "verified": True},
|
| 1128 |
-
"Cyberdyne": {"aliases": ["cyberdyne", "hal exoskeleton"], "country": "Japan", "product": "HAL", "status": "strong", "verified": True},
|
| 1129 |
-
"Lifeward": {"aliases": ["lifeward", "rewalk", "alterg"], "country": "Israel/USA", "product": "ReWalk 7", "status": "consolidating", "verified": True},
|
| 1130 |
-
"Fourier": {"aliases": ["fourier", "fourier intelligence"], "country": "China", "product": "X1, M2", "status": "growing", "verified": True},
|
| 1131 |
-
"Myomo": {"aliases": ["myomo", "myopro"], "country": "USA", "product": "MyoPro", "status": "stable", "verified": False},
|
| 1132 |
-
"Bionik": {"aliases": ["bionik", "inmotion"], "country": "Canada", "product": "InMotion", "status": "stable", "verified": False},
|
| 1133 |
-
"Wandercraft": {"aliases": ["wandercraft", "atalante"], "country": "France", "product": "Atalante X", "status": "growing", "verified": False},
|
| 1134 |
-
}
|
| 1135 |
-
|
| 1136 |
-
# Status detection keywords (order matters)
|
| 1137 |
-
STATUS_KEYWORDS = [
|
| 1138 |
-
("collapsed", ["bankrupt", "delisted", "suspended", "collapse", "shut down", "ceased", "nasdaq delisted"]),
|
| 1139 |
-
("weak", ["52-week low", "struggling", "losses", "declining", "layoffs"]),
|
| 1140 |
-
("growing", ["series e", "series d", "series c", "funding round", "$109 million"]),
|
| 1141 |
-
("consolidating", ["acquired", "merger", "acquisition"]),
|
| 1142 |
-
("strong", ["leader", "dominant", "profitable"]),
|
| 1143 |
-
]
|
| 1144 |
-
|
| 1145 |
-
# Extraction patterns
|
| 1146 |
-
DATE_PATTERN = re.compile(
|
| 1147 |
-
r'((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})'
|
| 1148 |
-
r'|(\d{4}-\d{2}-\d{2})'
|
| 1149 |
-
r'|(\d{4}-\d{2})'
|
| 1150 |
-
r'|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4})'
|
| 1151 |
-
)
|
| 1152 |
-
MONEY_PATTERN = re.compile(r'\$[\d,]+(?:\.\d+)?(?:\s*(?:million|billion|M|B))?|\d+(?:\.\d+)?\s*(?:million|billion)', re.IGNORECASE)
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
class CompetitorExtractor:
|
| 1156 |
-
"""Extract structured competitor data from research results."""
|
| 1157 |
-
|
| 1158 |
-
def __init__(self, research_dir: Path = RESEARCH_DIR,
|
| 1159 |
-
output_file: Path = None):
|
| 1160 |
-
self.research_dir = research_dir
|
| 1161 |
-
self.output_file = output_file or (DATA_DIR / "competitors.json")
|
| 1162 |
-
|
| 1163 |
-
def load_research_files(self) -> list[dict]:
|
| 1164 |
-
"""Load all JSON research files."""
|
| 1165 |
-
results = []
|
| 1166 |
-
if not self.research_dir.exists():
|
| 1167 |
-
return results
|
| 1168 |
-
|
| 1169 |
-
for json_file in self.research_dir.glob("*.json"):
|
| 1170 |
-
if json_file.name.startswith("."):
|
| 1171 |
-
continue
|
| 1172 |
-
try:
|
| 1173 |
-
with open(json_file) as f:
|
| 1174 |
-
data = json.load(f)
|
| 1175 |
-
data["_source_file"] = json_file.name
|
| 1176 |
-
results.append(data)
|
| 1177 |
-
except Exception as e:
|
| 1178 |
-
print(f"Error loading {json_file}: {e}")
|
| 1179 |
-
|
| 1180 |
-
return results
|
| 1181 |
-
|
| 1182 |
-
def find_mentions(self, text: str) -> list[str]:
|
| 1183 |
-
"""Find which companies are mentioned in text."""
|
| 1184 |
-
text_lower = text.lower()
|
| 1185 |
-
mentioned = []
|
| 1186 |
-
for company, info in COMPANY_DEFINITIONS.items():
|
| 1187 |
-
if any(alias in text_lower for alias in info["aliases"]):
|
| 1188 |
-
mentioned.append(company)
|
| 1189 |
-
return mentioned
|
| 1190 |
-
|
| 1191 |
-
def normalize_date(self, date_str: str) -> str | None:
|
| 1192 |
-
"""Normalize date string to YYYY-MM-DD. Returns None for bogus dates."""
|
| 1193 |
-
formats = ["%B %d, %Y", "%B %d %Y", "%b %d, %Y", "%b %d %Y", "%Y-%m-%d", "%Y-%m"]
|
| 1194 |
-
for fmt in formats:
|
| 1195 |
-
try:
|
| 1196 |
-
dt = datetime.strptime(date_str.strip(), fmt)
|
| 1197 |
-
if dt.year < 2010:
|
| 1198 |
-
return None
|
| 1199 |
-
return dt.strftime("%Y-%m-%d")
|
| 1200 |
-
except:
|
| 1201 |
-
pass
|
| 1202 |
-
return date_str
|
| 1203 |
-
|
| 1204 |
-
def extract_events(self, text: str, company: str) -> list[dict]:
|
| 1205 |
-
"""Extract events (date + context) from text."""
|
| 1206 |
-
events = []
|
| 1207 |
-
aliases = COMPANY_DEFINITIONS[company]["aliases"]
|
| 1208 |
-
|
| 1209 |
-
for match in DATE_PATTERN.finditer(text):
|
| 1210 |
-
date_str = match.group(0)
|
| 1211 |
-
if not date_str:
|
| 1212 |
-
continue
|
| 1213 |
-
|
| 1214 |
-
start = max(0, match.start() - 50)
|
| 1215 |
-
end = min(len(text), match.end() + 150)
|
| 1216 |
-
context = text[start:end]
|
| 1217 |
-
|
| 1218 |
-
normalized = self.normalize_date(date_str)
|
| 1219 |
-
if normalized and any(alias in context.lower() for alias in aliases):
|
| 1220 |
-
events.append({
|
| 1221 |
-
"date": normalized,
|
| 1222 |
-
"context": context.strip()
|
| 1223 |
-
})
|
| 1224 |
-
|
| 1225 |
-
return events
|
| 1226 |
-
|
| 1227 |
-
def detect_status(self, snippets: list[str], default: str) -> str:
|
| 1228 |
-
"""Detect status from snippets."""
|
| 1229 |
-
text = " ".join(snippets).lower()
|
| 1230 |
-
for status, keywords in STATUS_KEYWORDS:
|
| 1231 |
-
if any(kw.lower() in text for kw in keywords):
|
| 1232 |
-
return status
|
| 1233 |
-
return default
|
| 1234 |
-
|
| 1235 |
-
def extract_stock(self, snippets: list[str]) -> Optional[float]:
|
| 1236 |
-
"""Extract stock price."""
|
| 1237 |
-
for snippet in snippets:
|
| 1238 |
-
match = re.search(r'\$(\d+\.?\d*)', snippet)
|
| 1239 |
-
if match and float(match.group(1)) < 1000:
|
| 1240 |
-
return float(match.group(1))
|
| 1241 |
-
return None
|
| 1242 |
-
|
| 1243 |
-
def extract_funding(self, money_mentions: list[str]) -> Optional[int]:
|
| 1244 |
-
"""Extract funding amount."""
|
| 1245 |
-
for m in money_mentions:
|
| 1246 |
-
match = re.search(r'(\d+)\s*(?:million|M)', m, re.IGNORECASE)
|
| 1247 |
-
if match:
|
| 1248 |
-
return int(match.group(1)) * 1_000_000
|
| 1249 |
-
match = re.search(r'(\d+\.?\d*)\s*(?:billion|B)', m, re.IGNORECASE)
|
| 1250 |
-
if match:
|
| 1251 |
-
return int(float(match.group(1)) * 1_000_000_000)
|
| 1252 |
-
return None
|
| 1253 |
-
|
| 1254 |
-
def _load_intel_findings(self) -> dict[str, list[dict]]:
|
| 1255 |
-
"""Load confirmed findings from Deep Intel reports, grouped by company.
|
| 1256 |
-
|
| 1257 |
-
Returns: {company_name: [{"text": ..., "confirmed": bool, "source": ...}, ...]}
|
| 1258 |
-
Only loads the latest report per company.
|
| 1259 |
-
"""
|
| 1260 |
-
intel_dir = self.research_dir / "intel"
|
| 1261 |
-
if not intel_dir.exists():
|
| 1262 |
-
return {}
|
| 1263 |
-
|
| 1264 |
-
findings_by_company: dict[str, list[dict]] = {}
|
| 1265 |
-
seen_companies: set[str] = set()
|
| 1266 |
-
|
| 1267 |
-
for json_file in sorted(intel_dir.glob("*_intel.json"), reverse=True):
|
| 1268 |
-
try:
|
| 1269 |
-
with open(json_file) as f:
|
| 1270 |
-
data = json.load(f)
|
| 1271 |
-
company = data.get("company", "")
|
| 1272 |
-
if not company or company in seen_companies:
|
| 1273 |
-
continue
|
| 1274 |
-
seen_companies.add(company)
|
| 1275 |
-
|
| 1276 |
-
all_findings = []
|
| 1277 |
-
for section in data.get("sections", {}).values():
|
| 1278 |
-
for finding in section.get("findings", []):
|
| 1279 |
-
if isinstance(finding, dict) and finding.get("text"):
|
| 1280 |
-
all_findings.append(finding)
|
| 1281 |
-
elif isinstance(finding, str) and finding:
|
| 1282 |
-
all_findings.append({"text": finding, "confirmed": False, "source": ""})
|
| 1283 |
-
|
| 1284 |
-
if all_findings:
|
| 1285 |
-
findings_by_company[company] = all_findings
|
| 1286 |
-
except Exception:
|
| 1287 |
-
pass
|
| 1288 |
-
|
| 1289 |
-
return findings_by_company
|
| 1290 |
-
|
| 1291 |
-
def _extract_intel_opportunities(self, intel_findings: dict[str, list[dict]]) -> list[dict]:
|
| 1292 |
-
"""Extract opportunity signals from Deep Intel confirmed findings.
|
| 1293 |
-
|
| 1294 |
-
Scans for: vulnerability keywords, financial weakness, market gaps,
|
| 1295 |
-
technology gaps, regulatory issues, customer complaints.
|
| 1296 |
-
"""
|
| 1297 |
-
opportunities = []
|
| 1298 |
-
|
| 1299 |
-
vuln_patterns = [
|
| 1300 |
-
(r'(?:layoff|restructur|downsiz|headcount.?reduc)', "workforce_cut", 2),
|
| 1301 |
-
(r'(?:delist|stock.?(?:drop|fall|declin)|52.week.low|penny.stock)', "financial_distress", 1),
|
| 1302 |
-
(r'(?:FDA.?(?:reject|warning|recall)|regulatory.?(?:issue|fail|delay))', "regulatory_issue", 2),
|
| 1303 |
-
(r'(?:bankrupt|insolvenc|cease.?operat|wind.?down|liquidat)', "collapse", 1),
|
| 1304 |
-
(r'(?:customer.?complain|negative.?review|churn|losing.?customer)', "customer_risk", 2),
|
| 1305 |
-
(r'(?:legacy|technical.?debt|outdated|proprietary.?lock)', "tech_weakness", 3),
|
| 1306 |
-
(r'(?:no.?AI|lack.?(?:of.?)?(?:data|machine.learn|personali))', "ai_gap", 2),
|
| 1307 |
-
]
|
| 1308 |
-
|
| 1309 |
-
for company, findings in intel_findings.items():
|
| 1310 |
-
confirmed = [f for f in findings if f.get("confirmed")]
|
| 1311 |
-
all_text = " ".join(f["text"] for f in confirmed).lower() if confirmed else ""
|
| 1312 |
-
all_text_full = " ".join(f["text"] for f in findings).lower()
|
| 1313 |
-
|
| 1314 |
-
for pattern, opp_type, priority in vuln_patterns:
|
| 1315 |
-
# Check confirmed findings first (higher weight)
|
| 1316 |
-
if re.search(pattern, all_text, re.IGNORECASE):
|
| 1317 |
-
match_finding = next(
|
| 1318 |
-
(f for f in confirmed if re.search(pattern, f["text"], re.IGNORECASE)),
|
| 1319 |
-
None
|
| 1320 |
-
)
|
| 1321 |
-
if match_finding:
|
| 1322 |
-
opportunities.append({
|
| 1323 |
-
"type": opp_type,
|
| 1324 |
-
"text": f"{company}: {match_finding['text'][:120]}",
|
| 1325 |
-
"priority": priority,
|
| 1326 |
-
"confirmed": True,
|
| 1327 |
-
"source": match_finding.get("source", ""),
|
| 1328 |
-
"company": company,
|
| 1329 |
-
})
|
| 1330 |
-
# Then speculative
|
| 1331 |
-
elif re.search(pattern, all_text_full, re.IGNORECASE):
|
| 1332 |
-
match_finding = next(
|
| 1333 |
-
(f for f in findings if re.search(pattern, f["text"], re.IGNORECASE)),
|
| 1334 |
-
None
|
| 1335 |
-
)
|
| 1336 |
-
if match_finding:
|
| 1337 |
-
opportunities.append({
|
| 1338 |
-
"type": opp_type,
|
| 1339 |
-
"text": f"{company}: {match_finding['text'][:120]}",
|
| 1340 |
-
"priority": priority + 1,
|
| 1341 |
-
"confirmed": False,
|
| 1342 |
-
"source": match_finding.get("source", ""),
|
| 1343 |
-
"company": company,
|
| 1344 |
-
})
|
| 1345 |
-
|
| 1346 |
-
return opportunities
|
| 1347 |
-
|
| 1348 |
-
def _load_sota_tech_signals(self) -> list[dict]:
|
| 1349 |
-
"""Load tech advantage signals from SOTA knowledge base.
|
| 1350 |
-
|
| 1351 |
-
Reads data/sota/knowledge_base.json, extracts high-priority
|
| 1352 |
-
techniques and key principles as opportunity items.
|
| 1353 |
-
"""
|
| 1354 |
-
kb_path = self.research_dir / "sota" / "knowledge_base.json"
|
| 1355 |
-
if not kb_path.exists():
|
| 1356 |
-
return []
|
| 1357 |
-
|
| 1358 |
-
try:
|
| 1359 |
-
with open(kb_path) as f:
|
| 1360 |
-
kb = json.load(f)
|
| 1361 |
-
except Exception:
|
| 1362 |
-
return []
|
| 1363 |
-
|
| 1364 |
-
signals = []
|
| 1365 |
-
|
| 1366 |
-
# High-priority techniques → tech advantages (priority 2)
|
| 1367 |
-
for t in kb.get("techniques", []):
|
| 1368 |
-
if t.get("priority") == "high" and t.get("gurma_fit"):
|
| 1369 |
-
signals.append({
|
| 1370 |
-
"type": "tech_advantage",
|
| 1371 |
-
"text": f"{t['name']}: {t['gurma_fit'][:120]}",
|
| 1372 |
-
"priority": 2,
|
| 1373 |
-
"confirmed": True,
|
| 1374 |
-
"company": "GURMA",
|
| 1375 |
-
})
|
| 1376 |
-
|
| 1377 |
-
# Key principles → strategic validation (priority 3, capped at 2)
|
| 1378 |
-
for p in kb.get("key_principles", [])[:2]:
|
| 1379 |
-
if p.get("principle"):
|
| 1380 |
-
signals.append({
|
| 1381 |
-
"type": "tech_principle",
|
| 1382 |
-
"text": f"{p['principle']}: {p.get('detail', '')[:100]}",
|
| 1383 |
-
"priority": 3,
|
| 1384 |
-
"confirmed": True,
|
| 1385 |
-
"company": "GURMA",
|
| 1386 |
-
})
|
| 1387 |
-
|
| 1388 |
-
return signals
|
| 1389 |
-
|
| 1390 |
-
def _opportunity_changed(self, new_opps: list[dict], existing: dict) -> bool:
|
| 1391 |
-
"""Decide whether new opportunity data is materially different from existing.
|
| 1392 |
-
|
| 1393 |
-
Returns True if the widget deserves an update. Criteria:
|
| 1394 |
-
- New opportunity types appeared (e.g. a collapse that wasn't there)
|
| 1395 |
-
- Priority-1 signals changed
|
| 1396 |
-
- >50% of top points are different
|
| 1397 |
-
"""
|
| 1398 |
-
existing_points = set(existing.get("points", []))
|
| 1399 |
-
new_points = set(o["text"] for o in new_opps[:4])
|
| 1400 |
-
|
| 1401 |
-
if not existing_points:
|
| 1402 |
-
return True
|
| 1403 |
-
|
| 1404 |
-
# Check if any priority-1 signals are new
|
| 1405 |
-
new_p1_types = {o["type"] for o in new_opps if o["priority"] == 1}
|
| 1406 |
-
old_raw = existing.get("raw_opportunities", [])
|
| 1407 |
-
old_p1_types = {o["type"] for o in old_raw if o.get("priority") == 1}
|
| 1408 |
-
if new_p1_types != old_p1_types:
|
| 1409 |
-
return True
|
| 1410 |
-
|
| 1411 |
-
# Check overlap of top points — if less than half match, it's a meaningful change
|
| 1412 |
-
overlap = existing_points & new_points
|
| 1413 |
-
if len(overlap) < len(existing_points) / 2:
|
| 1414 |
-
return True
|
| 1415 |
-
|
| 1416 |
-
return False
|
| 1417 |
-
|
| 1418 |
-
def _synthesize_opportunity_llm(self, opportunities: list[dict],
|
| 1419 |
-
competitors: list[dict]) -> Optional[dict]:
|
| 1420 |
-
"""Use LLM to synthesize a strategic opportunity headline + points.
|
| 1421 |
-
|
| 1422 |
-
Returns {headline, points} or None if LLM unavailable/fails.
|
| 1423 |
-
"""
|
| 1424 |
-
if not LLM_ENABLED:
|
| 1425 |
-
return None
|
| 1426 |
-
|
| 1427 |
-
llm = LLMClient()
|
| 1428 |
-
|
| 1429 |
-
opp_text = "\n".join(
|
| 1430 |
-
f"- [{o['type']}] {'[CONFIRMED]' if o.get('confirmed') else '[SPECULATIVE]'} {o['text']}"
|
| 1431 |
-
for o in opportunities[:12]
|
| 1432 |
-
)
|
| 1433 |
-
|
| 1434 |
-
comp_summary = "\n".join(
|
| 1435 |
-
f"- {c['name']}: status={c['status']}, "
|
| 1436 |
-
f"{'stock=$'+format(c['stock'], '.2f') if c.get('stock') else 'no stock data'}, "
|
| 1437 |
-
f"{'funding=$'+format(c['funding']/1e6, '.0f')+'M' if c.get('funding') else 'no funding data'}"
|
| 1438 |
-
for c in competitors[:8]
|
| 1439 |
-
)
|
| 1440 |
-
|
| 1441 |
-
system = (
|
| 1442 |
-
"You are a strategic advisor for GURMA.ai, a Swiss AI company "
|
| 1443 |
-
"entering rehabilitation robotics with 15 years of patient outcome "
|
| 1444 |
-
"data (not just motion data) from BAMA Teknoloji. "
|
| 1445 |
-
"You produce concise, actionable strategic assessments."
|
| 1446 |
-
)
|
| 1447 |
-
|
| 1448 |
-
prompt = f"""Based on the following competitive + technology signals and competitor data,
|
| 1449 |
-
produce a strategic opportunity assessment for GURMA.ai.
|
| 1450 |
-
|
| 1451 |
-
Signals (competitive, tech advantages, and threats):
|
| 1452 |
-
{opp_text}
|
| 1453 |
-
|
| 1454 |
-
Competitor landscape:
|
| 1455 |
-
{comp_summary}
|
| 1456 |
-
|
| 1457 |
-
Return JSON:
|
| 1458 |
-
{{
|
| 1459 |
-
"headline": "One punchy sentence (max 10 words) summarizing the #1 strategic opportunity",
|
| 1460 |
-
"points": [
|
| 1461 |
-
"Actionable insight 1 (max 20 words, include numbers where available)",
|
| 1462 |
-
"Actionable insight 2",
|
| 1463 |
-
"Actionable insight 3",
|
| 1464 |
-
"Actionable insight 4"
|
| 1465 |
-
]
|
| 1466 |
-
}}
|
| 1467 |
-
|
| 1468 |
-
Rules:
|
| 1469 |
-
- Headline should be about the OPPORTUNITY, not just a competitor's problem
|
| 1470 |
-
- Points should mix competitive windows, tech advantages, AND threats
|
| 1471 |
-
- Be specific: include dollar amounts, dates, competitor names, model/technique names
|
| 1472 |
-
- Maximum 4 points, ranked by strategic importance
|
| 1473 |
-
- confirmed signals should be weighted more heavily than speculative ones"""
|
| 1474 |
-
|
| 1475 |
-
response = llm.call(prompt, system, max_tokens=500)
|
| 1476 |
-
if response:
|
| 1477 |
-
match = re.search(r'\{.*\}', response, re.DOTALL)
|
| 1478 |
-
if match:
|
| 1479 |
-
try:
|
| 1480 |
-
result = json.loads(match.group())
|
| 1481 |
-
if result.get("headline") and result.get("points"):
|
| 1482 |
-
return result
|
| 1483 |
-
except Exception:
|
| 1484 |
-
pass
|
| 1485 |
-
return None
|
| 1486 |
-
|
| 1487 |
-
def detect_opportunities(self, competitors: list[dict], all_snippets: list[str]) -> dict:
|
| 1488 |
-
"""
|
| 1489 |
-
Detect market opportunities from competitor data + Deep Intel findings.
|
| 1490 |
-
|
| 1491 |
-
Combines: structured competitor status, Deep Intel confirmed findings,
|
| 1492 |
-
and optionally LLM synthesis for headline/points.
|
| 1493 |
-
"""
|
| 1494 |
-
opportunities = []
|
| 1495 |
-
|
| 1496 |
-
# --- Source 1: Structured competitor status (always available) ---
|
| 1497 |
-
collapsed = [c for c in competitors if c["status"] == "collapsed"]
|
| 1498 |
-
weak = [c for c in competitors if c["status"] == "weak"]
|
| 1499 |
-
|
| 1500 |
-
if collapsed:
|
| 1501 |
-
names = ", ".join(c["name"] for c in collapsed)
|
| 1502 |
-
opportunities.append({
|
| 1503 |
-
"type": "market_gap",
|
| 1504 |
-
"text": f"{names} collapsed — customers seeking alternatives",
|
| 1505 |
-
"priority": 1, "confirmed": True, "company": names,
|
| 1506 |
-
})
|
| 1507 |
-
|
| 1508 |
-
if weak:
|
| 1509 |
-
for c in weak:
|
| 1510 |
-
opp_text = f"{c['name']} financially weak"
|
| 1511 |
-
if c.get("stock"):
|
| 1512 |
-
opp_text += f" (${c['stock']:.2f})"
|
| 1513 |
-
opp_text += " — vulnerable to disruption"
|
| 1514 |
-
opportunities.append({
|
| 1515 |
-
"type": "weakness",
|
| 1516 |
-
"text": opp_text,
|
| 1517 |
-
"priority": 2, "confirmed": True, "company": c["name"],
|
| 1518 |
-
})
|
| 1519 |
-
|
| 1520 |
-
growing = [c for c in competitors if c["status"] == "growing" and c.get("funding")]
|
| 1521 |
-
for c in growing:
|
| 1522 |
-
funding_m = c["funding"] / 1_000_000
|
| 1523 |
-
opportunities.append({
|
| 1524 |
-
"type": "threat",
|
| 1525 |
-
"text": f"{c['name']} well-funded (${funding_m:.0f}M) — monitor closely",
|
| 1526 |
-
"priority": 3, "confirmed": True, "company": c["name"],
|
| 1527 |
-
})
|
| 1528 |
-
|
| 1529 |
-
# BAMA data advantage
|
| 1530 |
-
if competitors:
|
| 1531 |
-
opportunities.append({
|
| 1532 |
-
"type": "advantage",
|
| 1533 |
-
"text": "BAMA has 15 years outcome data vs. competitors' motion data",
|
| 1534 |
-
"priority": 1, "confirmed": True, "company": "BAMA",
|
| 1535 |
-
})
|
| 1536 |
-
|
| 1537 |
-
# --- Source 2: Deep Intel findings (if available) ---
|
| 1538 |
-
intel_findings = self._load_intel_findings()
|
| 1539 |
-
if intel_findings:
|
| 1540 |
-
intel_opps = self._extract_intel_opportunities(intel_findings)
|
| 1541 |
-
existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
|
| 1542 |
-
for io in intel_opps:
|
| 1543 |
-
key = (io.get("company", ""), io["type"])
|
| 1544 |
-
if key not in existing_keys:
|
| 1545 |
-
opportunities.append(io)
|
| 1546 |
-
existing_keys.add(key)
|
| 1547 |
-
|
| 1548 |
-
# --- Source 3: SOTA KB tech signals (if available) ---
|
| 1549 |
-
sota_signals = self._load_sota_tech_signals()
|
| 1550 |
-
if sota_signals:
|
| 1551 |
-
existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
|
| 1552 |
-
for ts in sota_signals:
|
| 1553 |
-
key = (ts.get("company", ""), ts["type"])
|
| 1554 |
-
if key not in existing_keys:
|
| 1555 |
-
opportunities.append(ts)
|
| 1556 |
-
existing_keys.add(key)
|
| 1557 |
-
|
| 1558 |
-
# Sort by priority
|
| 1559 |
-
opportunities.sort(key=lambda x: x["priority"])
|
| 1560 |
-
|
| 1561 |
-
# --- Headline + points: LLM synthesis or rule-based fallback ---
|
| 1562 |
-
llm_result = self._synthesize_opportunity_llm(opportunities, competitors)
|
| 1563 |
-
|
| 1564 |
-
if llm_result:
|
| 1565 |
-
headline = llm_result["headline"]
|
| 1566 |
-
points = llm_result["points"][:4]
|
| 1567 |
-
else:
|
| 1568 |
-
if collapsed:
|
| 1569 |
-
headline = f"{collapsed[0]['name']} collapse creates market window"
|
| 1570 |
-
elif weak:
|
| 1571 |
-
headline = "Competitor weakness creates opportunity"
|
| 1572 |
-
else:
|
| 1573 |
-
headline = "Data advantage positions GURMA.ai for growth"
|
| 1574 |
-
points = [o["text"] for o in opportunities[:4]]
|
| 1575 |
-
|
| 1576 |
-
# Build sources list
|
| 1577 |
-
sources = ["competitor"]
|
| 1578 |
-
if intel_findings:
|
| 1579 |
-
sources.append("intel")
|
| 1580 |
-
if sota_signals:
|
| 1581 |
-
sources.append("tech")
|
| 1582 |
-
if llm_result:
|
| 1583 |
-
sources.append("llm")
|
| 1584 |
-
|
| 1585 |
-
return {
|
| 1586 |
-
"headline": headline,
|
| 1587 |
-
"points": points,
|
| 1588 |
-
"detected_at": datetime.now().strftime("%Y-%m-%d"),
|
| 1589 |
-
"raw_opportunities": opportunities,
|
| 1590 |
-
"sources": sources,
|
| 1591 |
-
}
|
| 1592 |
-
|
| 1593 |
-
def load_existing_data(self) -> Optional[dict]:
|
| 1594 |
-
"""Load existing competitors.json if it exists."""
|
| 1595 |
-
if self.output_file.exists():
|
| 1596 |
-
try:
|
| 1597 |
-
with open(self.output_file) as f:
|
| 1598 |
-
return json.load(f)
|
| 1599 |
-
except:
|
| 1600 |
-
pass
|
| 1601 |
-
return None
|
| 1602 |
-
|
| 1603 |
-
def process(self) -> dict:
|
| 1604 |
-
"""Process research files and build competitors.json."""
|
| 1605 |
-
research_data = self.load_research_files()
|
| 1606 |
-
if not research_data:
|
| 1607 |
-
return {"competitors": [], "market": {}}
|
| 1608 |
-
|
| 1609 |
-
# Aggregate data per company
|
| 1610 |
-
from collections import defaultdict
|
| 1611 |
-
company_data = defaultdict(lambda: {
|
| 1612 |
-
"mentions": 0, "snippets": [], "events": [], "money": [], "urls": []
|
| 1613 |
-
})
|
| 1614 |
-
|
| 1615 |
-
for research in research_data:
|
| 1616 |
-
for result in research.get("results", []):
|
| 1617 |
-
text = f"{result.get('title', '')} {result.get('snippet', '')}"
|
| 1618 |
-
url = result.get("url", "")
|
| 1619 |
-
|
| 1620 |
-
for company in self.find_mentions(text):
|
| 1621 |
-
cd = company_data[company]
|
| 1622 |
-
cd["mentions"] += 1
|
| 1623 |
-
cd["snippets"].append(result.get("snippet", "")[:200])
|
| 1624 |
-
cd["urls"].append(url)
|
| 1625 |
-
cd["events"].extend(self.extract_events(text, company))
|
| 1626 |
-
cd["money"].extend(MONEY_PATTERN.findall(text))
|
| 1627 |
-
|
| 1628 |
-
# Build output
|
| 1629 |
-
competitors = []
|
| 1630 |
-
for company, info in COMPANY_DEFINITIONS.items():
|
| 1631 |
-
data = company_data[company]
|
| 1632 |
-
|
| 1633 |
-
status = info["status"] if info.get("verified") else self.detect_status(data["snippets"], info["status"])
|
| 1634 |
-
|
| 1635 |
-
competitors.append({
|
| 1636 |
-
"name": company,
|
| 1637 |
-
"country": info["country"],
|
| 1638 |
-
"product": info["product"],
|
| 1639 |
-
"status": status,
|
| 1640 |
-
"stock": self.extract_stock(data["snippets"]),
|
| 1641 |
-
"funding": self.extract_funding(data["money"]),
|
| 1642 |
-
"notes": data["snippets"][0] if data["snippets"] else "",
|
| 1643 |
-
"mentions": data["mentions"],
|
| 1644 |
-
"events": [{"date": e["date"], "event": e["context"][:100]} for e in data["events"][:10]],
|
| 1645 |
-
"sample_urls": list(set(data["urls"]))[:5],
|
| 1646 |
-
})
|
| 1647 |
-
|
| 1648 |
-
competitors.sort(key=lambda x: x["mentions"], reverse=True)
|
| 1649 |
-
|
| 1650 |
-
# Detect opportunities (from competitor status + Deep Intel findings)
|
| 1651 |
-
all_snippets = []
|
| 1652 |
-
for company, data in company_data.items():
|
| 1653 |
-
all_snippets.extend(data["snippets"])
|
| 1654 |
-
new_opportunity = self.detect_opportunities(competitors, all_snippets)
|
| 1655 |
-
|
| 1656 |
-
# Decide whether to update the widget
|
| 1657 |
-
existing = self.load_existing_data()
|
| 1658 |
-
existing_opp = existing.get("opportunity", {}) if existing else {}
|
| 1659 |
-
|
| 1660 |
-
if existing_opp.get("confirmed"):
|
| 1661 |
-
# Confirmed: only flag update if data materially changed
|
| 1662 |
-
if self._opportunity_changed(new_opportunity.get("raw_opportunities", []), existing_opp):
|
| 1663 |
-
opportunity = existing_opp
|
| 1664 |
-
opportunity["update_available"] = True
|
| 1665 |
-
opportunity["suggested_update"] = new_opportunity
|
| 1666 |
-
else:
|
| 1667 |
-
opportunity = existing_opp
|
| 1668 |
-
opportunity["update_available"] = False
|
| 1669 |
-
else:
|
| 1670 |
-
# Not confirmed: auto-update
|
| 1671 |
-
opportunity = new_opportunity
|
| 1672 |
-
opportunity["confirmed"] = False
|
| 1673 |
-
opportunity["update_available"] = False
|
| 1674 |
-
|
| 1675 |
-
return {
|
| 1676 |
-
"competitors": competitors,
|
| 1677 |
-
"market": {"size_2024": 2_000_000_000, "size_2029_ai": 9_100_000_000, "cagr": 0.278},
|
| 1678 |
-
"opportunity": opportunity,
|
| 1679 |
-
"_generated": datetime.now().isoformat(),
|
| 1680 |
-
"_source_files": [f.name for f in self.research_dir.glob("*.json") if not f.name.startswith(".")]
|
| 1681 |
-
}
|
| 1682 |
-
|
| 1683 |
-
def save(self, data: dict = None) -> Path:
|
| 1684 |
-
"""Process and save to output file."""
|
| 1685 |
-
data = data or self.process()
|
| 1686 |
-
self.output_file.parent.mkdir(parents=True, exist_ok=True)
|
| 1687 |
-
with open(self.output_file, "w") as f:
|
| 1688 |
-
json.dump(data, f, indent=2)
|
| 1689 |
-
return self.output_file
|
| 1690 |
-
|
| 1691 |
-
|
| 1692 |
-
# ============================================================
|
| 1693 |
-
# CLI Commands
|
| 1694 |
-
# ============================================================
|
| 1695 |
-
|
| 1696 |
-
def cmd_extract(args):
|
| 1697 |
-
"""Extract competitor data from research."""
|
| 1698 |
-
extractor = CompetitorExtractor()
|
| 1699 |
-
|
| 1700 |
-
print(f"Loading research from: {extractor.research_dir}")
|
| 1701 |
-
data = extractor.process()
|
| 1702 |
-
|
| 1703 |
-
if not data["competitors"]:
|
| 1704 |
-
print("No research files found. Run 'batch' first.")
|
| 1705 |
-
return
|
| 1706 |
-
|
| 1707 |
-
output = extractor.save(data)
|
| 1708 |
-
|
| 1709 |
-
print(f"Saved to: {output}")
|
| 1710 |
-
print(f"\nCompany mentions:")
|
| 1711 |
-
for comp in data["competitors"]:
|
| 1712 |
-
status_marker = {"collapsed": "⚠", "weak": "↓", "growing": "↑", "strong": "★"}.get(comp["status"], "•")
|
| 1713 |
-
print(f" {status_marker} {comp['name']}: {comp['mentions']} mentions ({comp['status']})")
|
| 1714 |
-
|
| 1715 |
-
|
| 1716 |
-
def cmd_search(args):
|
| 1717 |
-
"""Single search command."""
|
| 1718 |
-
service = SearchService(backend=args.backend)
|
| 1719 |
-
print(f"Searching: {args.query}")
|
| 1720 |
-
print(f"Backend: {args.backend} | Max: {args.max_results}")
|
| 1721 |
-
print("-" * 50)
|
| 1722 |
-
|
| 1723 |
-
results = service.search(args.query, args.max_results, save=args.save)
|
| 1724 |
-
|
| 1725 |
-
for i, r in enumerate(results, 1):
|
| 1726 |
-
print(f"\n{i}. {r.title}")
|
| 1727 |
-
print(f" {r.url}")
|
| 1728 |
-
print(f" {r.snippet[:150]}...")
|
| 1729 |
-
|
| 1730 |
-
print(f"\n[{len(results)} results]")
|
| 1731 |
-
if args.save:
|
| 1732 |
-
print(f"Saved to: {RESEARCH_DIR}")
|
| 1733 |
-
|
| 1734 |
-
|
| 1735 |
-
def cmd_batch(args):
|
| 1736 |
-
"""Batch research command."""
|
| 1737 |
-
service = SearchService(backend=args.backend)
|
| 1738 |
-
storage = ResultStorage()
|
| 1739 |
-
|
| 1740 |
-
# Generate all queries
|
| 1741 |
-
queries = []
|
| 1742 |
-
for company in COMPETITORS:
|
| 1743 |
-
for template in BATCH_QUERY_TEMPLATES:
|
| 1744 |
-
queries.append(template.format(company=company))
|
| 1745 |
-
queries.extend(MARKET_QUERIES)
|
| 1746 |
-
|
| 1747 |
-
total_queries = len(queries)
|
| 1748 |
-
|
| 1749 |
-
# Deduplicate unless --force is set
|
| 1750 |
-
skipped = 0
|
| 1751 |
-
if not args.force:
|
| 1752 |
-
recent = storage.get_recent_queries(days=args.days)
|
| 1753 |
-
original_count = len(queries)
|
| 1754 |
-
queries = [q for q in queries if q.lower().strip() not in recent]
|
| 1755 |
-
skipped = original_count - len(queries)
|
| 1756 |
-
|
| 1757 |
-
print(f"Batch Research")
|
| 1758 |
-
print(f"{'='*60}")
|
| 1759 |
-
print(f"Competitors: {len(COMPETITORS)}")
|
| 1760 |
-
print(f"Total queries: {total_queries}")
|
| 1761 |
-
if skipped > 0:
|
| 1762 |
-
print(f"Skipped (run in last {args.days} days): {skipped}")
|
| 1763 |
-
print(f"New queries to run: {len(queries)}")
|
| 1764 |
-
print(f"Output: {RESEARCH_DIR}")
|
| 1765 |
-
print(f"{'='*60}")
|
| 1766 |
-
|
| 1767 |
-
if not queries:
|
| 1768 |
-
print("\nNo new queries to run. Use --force to re-run all.")
|
| 1769 |
-
return
|
| 1770 |
-
|
| 1771 |
-
def progress(i, total, query):
|
| 1772 |
-
print(f"\n[{i}/{total}] {query}")
|
| 1773 |
-
|
| 1774 |
-
stats = service.search_batch(queries, args.max_results, args.delay, callback=progress)
|
| 1775 |
-
|
| 1776 |
-
success = sum(1 for v in stats.values() if v >= 0)
|
| 1777 |
-
print(f"\n{'='*60}")
|
| 1778 |
-
print(f"Complete: {success}/{len(queries)} successful")
|
| 1779 |
-
if skipped > 0:
|
| 1780 |
-
print(f"Skipped: {skipped} (already run recently)")
|
| 1781 |
-
print(f"{'='*60}")
|
| 1782 |
-
|
| 1783 |
-
|
| 1784 |
-
def cmd_competitor(args):
|
| 1785 |
-
"""Deep competitive intelligence on a single competitor."""
|
| 1786 |
-
company = args.company
|
| 1787 |
-
use_external_llm = args.external_llm
|
| 1788 |
-
|
| 1789 |
-
if use_external_llm and not LLM_ENABLED:
|
| 1790 |
-
print("Warning: --external-llm requested but OPENROUTER_API_KEY not found. Skipping external LLM.")
|
| 1791 |
-
use_external_llm = False
|
| 1792 |
-
|
| 1793 |
-
categories = None
|
| 1794 |
-
if args.categories:
|
| 1795 |
-
categories = [c.strip() for c in args.categories.split(",")]
|
| 1796 |
-
valid = set(DEEP_INTEL_CATEGORIES.keys())
|
| 1797 |
-
invalid = [c for c in categories if c not in valid]
|
| 1798 |
-
if invalid:
|
| 1799 |
-
print(f"Invalid categories: {invalid}")
|
| 1800 |
-
print(f"Valid: {sorted(valid)}")
|
| 1801 |
-
return
|
| 1802 |
-
|
| 1803 |
-
if args.list_categories:
|
| 1804 |
-
print("Available categories:")
|
| 1805 |
-
for key, cat in DEEP_INTEL_CATEGORIES.items():
|
| 1806 |
-
q_count = len(cat["queries"])
|
| 1807 |
-
print(f" {key:30s} {cat['label']:30s} ({q_count} queries)")
|
| 1808 |
-
return
|
| 1809 |
-
|
| 1810 |
-
agent = CompetitorIntelAgent(company)
|
| 1811 |
-
report_path = agent.run(
|
| 1812 |
-
categories=categories,
|
| 1813 |
-
use_external_llm=use_external_llm,
|
| 1814 |
-
delay=args.delay,
|
| 1815 |
-
max_results=args.max_results,
|
| 1816 |
)
|
| 1817 |
-
|
| 1818 |
-
|
| 1819 |
-
|
| 1820 |
-
|
| 1821 |
-
def cmd_sota(args):
|
| 1822 |
-
"""SOTA technology knowledge base."""
|
| 1823 |
-
try:
|
| 1824 |
-
from .sota_agent import SOTAScoutAgent
|
| 1825 |
-
except ImportError:
|
| 1826 |
-
from sota_agent import SOTAScoutAgent
|
| 1827 |
-
|
| 1828 |
-
agent = SOTAScoutAgent()
|
| 1829 |
-
|
| 1830 |
-
if args.analyze:
|
| 1831 |
-
report = agent.analyze(args.analyze)
|
| 1832 |
-
print(f"\nAnalysis report: {report}")
|
| 1833 |
-
return
|
| 1834 |
-
|
| 1835 |
-
# Default: show knowledge base
|
| 1836 |
-
agent.show(section=args.show)
|
| 1837 |
-
|
| 1838 |
-
|
| 1839 |
-
def cmd_list(args):
|
| 1840 |
-
"""List saved searches."""
|
| 1841 |
-
storage = ResultStorage()
|
| 1842 |
-
searches = storage.list_searches(args.limit)
|
| 1843 |
-
|
| 1844 |
-
if not searches:
|
| 1845 |
-
print(f"No searches in {RESEARCH_DIR}")
|
| 1846 |
-
return
|
| 1847 |
-
|
| 1848 |
-
print(f"Recent searches ({RESEARCH_DIR}):\n")
|
| 1849 |
-
for s in searches:
|
| 1850 |
-
print(f" {s['timestamp'][:10]} {s['results']:2d} results {s['query'][:50]}")
|
| 1851 |
-
|
| 1852 |
-
|
| 1853 |
-
def main():
|
| 1854 |
-
parser = argparse.ArgumentParser(
|
| 1855 |
-
description="GURMA.ai Research Tool",
|
| 1856 |
-
formatter_class=argparse.RawDescriptionHelpFormatter
|
| 1857 |
)
|
| 1858 |
-
|
| 1859 |
-
|
| 1860 |
-
|
| 1861 |
-
|
| 1862 |
-
|
| 1863 |
-
|
| 1864 |
-
|
| 1865 |
-
|
| 1866 |
-
|
| 1867 |
-
|
| 1868 |
-
|
| 1869 |
-
|
| 1870 |
-
|
| 1871 |
-
|
| 1872 |
-
|
| 1873 |
-
|
| 1874 |
-
|
| 1875 |
-
|
| 1876 |
-
|
| 1877 |
-
|
| 1878 |
-
|
| 1879 |
-
|
| 1880 |
-
|
| 1881 |
-
|
| 1882 |
-
|
| 1883 |
-
|
| 1884 |
-
|
| 1885 |
-
|
| 1886 |
-
|
| 1887 |
-
|
| 1888 |
-
|
| 1889 |
-
|
| 1890 |
-
|
| 1891 |
-
|
| 1892 |
-
|
| 1893 |
-
|
| 1894 |
-
# sota
|
| 1895 |
-
p_sota = subparsers.add_parser("sota", help="SOTA technology knowledge base for GURMA.ai")
|
| 1896 |
-
p_sota.add_argument("--analyze", "-a", type=str, default=None,
|
| 1897 |
-
help="Analyze a document and update knowledge base")
|
| 1898 |
-
p_sota.add_argument("--show", "-s", type=str, default=None, nargs="?",
|
| 1899 |
-
const=None,
|
| 1900 |
-
choices=["models", "techniques", "stack", "principles", "actions", "sources"],
|
| 1901 |
-
help="Show specific KB section (default: summary)")
|
| 1902 |
-
p_sota.set_defaults(func=cmd_sota)
|
| 1903 |
-
|
| 1904 |
-
# extract
|
| 1905 |
-
p_extract = subparsers.add_parser("extract", help="Extract competitor data to JSON")
|
| 1906 |
-
p_extract.set_defaults(func=cmd_extract)
|
| 1907 |
-
|
| 1908 |
-
# list
|
| 1909 |
-
p_list = subparsers.add_parser("list", help="List saved searches")
|
| 1910 |
-
p_list.add_argument("-l", "--limit", type=int, default=20)
|
| 1911 |
-
p_list.set_defaults(func=cmd_list)
|
| 1912 |
-
|
| 1913 |
-
args = parser.parse_args()
|
| 1914 |
-
|
| 1915 |
-
if hasattr(args, "func"):
|
| 1916 |
-
args.func(args)
|
| 1917 |
-
else:
|
| 1918 |
-
parser.print_help()
|
| 1919 |
-
|
| 1920 |
|
| 1921 |
if __name__ == "__main__":
|
| 1922 |
main()
|
|
|
|
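For orientation, the process() method above assembles the dashboard's competitors.json in roughly the shape sketched below. This is an illustration derived from the return statements in process() and detect_opportunities(); every concrete value (prices, dates, filenames, URLs) is a made-up stand-in, not real extracted data.

# Illustrative shape of data/competitors.json as built by CompetitorExtractor.process().
# All literal values below are hypothetical examples.
{
    "competitors": [
        {
            "name": "Myomo", "country": "USA", "product": "MyoPro",
            "status": "weak",              # verified status or detect_status() result
            "stock": 3.41, "funding": None,
            "notes": "first snippet mentioning the company…",
            "mentions": 12,
            "events": [{"date": "2025-06-01", "event": "context around the date…"}],
            "sample_urls": ["https://example.com/article"],
        },
    ],
    "market": {"size_2024": 2_000_000_000, "size_2029_ai": 9_100_000_000, "cagr": 0.278},
    "opportunity": {
        "headline": "…", "points": ["…"],
        "detected_at": "2026-02-18", "update_available": False,
    },
    "_generated": "2026-02-18T14:15:00",
    "_source_files": ["20260218-101500_myomo-stock-price.json"],
}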
#!/usr/bin/env python3
"""
GURMA.ai Research Tool — backwards-compatible shim.

All logic has been split into focused modules:
    config.py     — paths, API keys, constants
    search.py     — backends, storage, SearchService
    llm.py        — LLMClient (OpenRouter)
    intel.py      — CompetitorIntelAgent
    extract.py    — CompetitorExtractor, COMPANY_DEFINITIONS
    cli.py        — CLI commands and argparse
    sota_agent.py — SOTA knowledge base agent
    tr_agents.py  — Turkish research agents

This file re-exports everything so existing imports work unchanged:
    from research import SearchService, CompetitorExtractor, ...
    python research.py batch
"""

try:
    # Package context (src/utils/)
    from .config import (
        PROJECT_ROOT, IS_HF_SPACE, RESEARCH_DIR, DATA_DIR,
        SERPAPI_KEY, BRAVE_API_KEY, OPENROUTER_API_KEY,
        LLM_MODEL, LLM_ENABLED,
        COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES,
    )
    from .search import (
        WebSearchResult, SearchBackend, DuckDuckGoBackend,
        SerpAPIBackend, BraveBackend, BACKENDS, get_backend,
        ResultStorage, SearchService,
    )
    from .llm import LLMClient
    from .intel import (
        IntelSection, CompetitorIntelAgent,
        DEEP_INTEL_CATEGORIES, PRIMARY_SOURCE_DOMAINS,
        CATEGORY_EXPECTED, CATEGORY_SYNTHESIS_QUESTIONS,
    )
    from .extract import (
        CompetitorExtractor, COMPANY_DEFINITIONS,
        STATUS_KEYWORDS, DATE_PATTERN, MONEY_PATTERN,
    )
    from .cli import main
except ImportError:
    # Flat-file context (HF Space: all .py files in same directory)
    from config import (  # type: ignore[no-redef]
        PROJECT_ROOT, IS_HF_SPACE, RESEARCH_DIR, DATA_DIR,
        SERPAPI_KEY, BRAVE_API_KEY, OPENROUTER_API_KEY,
        LLM_MODEL, LLM_ENABLED,
        COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES,
    )
    from search import (  # type: ignore[no-redef]
        WebSearchResult, SearchBackend, DuckDuckGoBackend,
        SerpAPIBackend, BraveBackend, BACKENDS, get_backend,
        ResultStorage, SearchService,
    )
    from llm import LLMClient  # type: ignore[no-redef]
    from intel import (  # type: ignore[no-redef]
        IntelSection, CompetitorIntelAgent,
        DEEP_INTEL_CATEGORIES, PRIMARY_SOURCE_DOMAINS,
        CATEGORY_EXPECTED, CATEGORY_SYNTHESIS_QUESTIONS,
    )
    from extract import (  # type: ignore[no-redef]
        CompetitorExtractor, COMPANY_DEFINITIONS,
        STATUS_KEYWORDS, DATE_PATTERN, MONEY_PATTERN,
    )
    from cli import main  # type: ignore[no-redef]


if __name__ == "__main__":
    main()
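The try/except ImportError ladder above is the standard way to make one file importable both as a package module (relative imports under src/utils/) and as a flat script (all .py files side by side, as on the Space). A minimal sketch of the pattern, where helper and do_work() are hypothetical stand-ins rather than modules from this repo:

# Minimal sketch of the dual-context import fallback the shim relies on.
# "helper" and do_work() are hypothetical names, not part of this repo.
try:
    from .helper import do_work   # package context: python -m src.utils.tool
except ImportError:
    from helper import do_work    # flat-file context: python tool.py

if __name__ == "__main__":
    do_work()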
search.py
ADDED
@@ -0,0 +1,305 @@
"""
|
| 2 |
+
Search backends, result storage, and the SearchService facade.
|
| 3 |
+
|
| 4 |
+
Provides multi-backend web search (DuckDuckGo, SerpAPI, Brave),
|
| 5 |
+
result persistence (JSON + Markdown), and a unified SearchService.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import sys
|
| 12 |
+
import time
|
| 13 |
+
from abc import ABC, abstractmethod
|
| 14 |
+
from dataclasses import dataclass, asdict
|
| 15 |
+
from datetime import datetime, timedelta
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Protocol
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from .config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
|
| 21 |
+
except ImportError:
|
| 22 |
+
from config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ============================================================
|
| 26 |
+
# Data Types
|
| 27 |
+
# ============================================================
|
| 28 |
+
|
| 29 |
+
class SearchResult(Protocol):
|
| 30 |
+
title: str
|
| 31 |
+
url: str
|
| 32 |
+
snippet: str
|
| 33 |
+
source: str
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
|
| 37 |
+
class WebSearchResult:
|
| 38 |
+
title: str
|
| 39 |
+
url: str
|
| 40 |
+
snippet: str
|
| 41 |
+
source: str
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ============================================================
|
| 45 |
+
# Search Backends
|
| 46 |
+
# ============================================================
|
| 47 |
+
|
| 48 |
+
class SearchBackend(ABC):
|
| 49 |
+
"""Abstract base for search backends."""
|
| 50 |
+
|
| 51 |
+
@property
|
| 52 |
+
@abstractmethod
|
| 53 |
+
def name(self) -> str:
|
| 54 |
+
pass
|
| 55 |
+
|
| 56 |
+
@abstractmethod
|
| 57 |
+
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 58 |
+
pass
|
| 59 |
+
|
| 60 |
+
@abstractmethod
|
| 61 |
+
def is_available(self) -> bool:
|
| 62 |
+
pass
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class DuckDuckGoBackend(SearchBackend):
|
| 66 |
+
|
| 67 |
+
@property
|
| 68 |
+
def name(self) -> str:
|
| 69 |
+
return "duckduckgo"
|
| 70 |
+
|
| 71 |
+
def is_available(self) -> bool:
|
| 72 |
+
try:
|
| 73 |
+
from ddgs import DDGS
|
| 74 |
+
return True
|
| 75 |
+
except ImportError:
|
| 76 |
+
try:
|
| 77 |
+
from duckduckgo_search import DDGS
|
| 78 |
+
return True
|
| 79 |
+
except ImportError:
|
| 80 |
+
return False
|
| 81 |
+
|
| 82 |
+
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 83 |
+
try:
|
| 84 |
+
from ddgs import DDGS
|
| 85 |
+
except ImportError:
|
| 86 |
+
from duckduckgo_search import DDGS
|
| 87 |
+
|
| 88 |
+
results = []
|
| 89 |
+
ddgs = DDGS()
|
| 90 |
+
for r in ddgs.text(query, max_results=max_results):
|
| 91 |
+
results.append(WebSearchResult(
|
| 92 |
+
title=r.get("title", ""),
|
| 93 |
+
url=r.get("href", r.get("link", "")),
|
| 94 |
+
snippet=r.get("body", r.get("snippet", "")),
|
| 95 |
+
source=self.name
|
| 96 |
+
))
|
| 97 |
+
return results
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class SerpAPIBackend(SearchBackend):
|
| 101 |
+
|
| 102 |
+
@property
|
| 103 |
+
def name(self) -> str:
|
| 104 |
+
return "serpapi"
|
| 105 |
+
|
| 106 |
+
def is_available(self) -> bool:
|
| 107 |
+
try:
|
| 108 |
+
import requests
|
| 109 |
+
return bool(SERPAPI_KEY)
|
| 110 |
+
except ImportError:
|
| 111 |
+
return False
|
| 112 |
+
|
| 113 |
+
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 114 |
+
import requests
|
| 115 |
+
|
| 116 |
+
response = requests.get(
|
| 117 |
+
"https://serpapi.com/search",
|
| 118 |
+
params={"q": query, "api_key": SERPAPI_KEY, "engine": "google", "num": max_results},
|
| 119 |
+
timeout=30
|
| 120 |
+
)
|
| 121 |
+
response.raise_for_status()
|
| 122 |
+
data = response.json()
|
| 123 |
+
|
| 124 |
+
results = []
|
| 125 |
+
for r in data.get("organic_results", [])[:max_results]:
|
| 126 |
+
results.append(WebSearchResult(
|
| 127 |
+
title=r.get("title", ""),
|
| 128 |
+
url=r.get("link", ""),
|
| 129 |
+
snippet=r.get("snippet", ""),
|
| 130 |
+
source=self.name
|
| 131 |
+
))
|
| 132 |
+
return results
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class BraveBackend(SearchBackend):
|
| 136 |
+
|
| 137 |
+
@property
|
| 138 |
+
def name(self) -> str:
|
| 139 |
+
return "brave"
|
| 140 |
+
|
| 141 |
+
def is_available(self) -> bool:
|
| 142 |
+
try:
|
| 143 |
+
import requests
|
| 144 |
+
return bool(BRAVE_API_KEY)
|
| 145 |
+
except ImportError:
|
| 146 |
+
return False
|
| 147 |
+
|
| 148 |
+
def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
|
| 149 |
+
import requests
|
| 150 |
+
|
| 151 |
+
response = requests.get(
|
| 152 |
+
"https://api.search.brave.com/res/v1/web/search",
|
| 153 |
+
headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY},
|
| 154 |
+
params={"q": query, "count": min(max_results, 20)},
|
| 155 |
+
timeout=30
|
| 156 |
+
)
|
| 157 |
+
response.raise_for_status()
|
| 158 |
+
data = response.json()
|
| 159 |
+
|
| 160 |
+
results = []
|
| 161 |
+
for r in data.get("web", {}).get("results", [])[:max_results]:
|
| 162 |
+
results.append(WebSearchResult(
|
| 163 |
+
title=r.get("title", ""),
|
| 164 |
+
url=r.get("url", ""),
|
| 165 |
+
snippet=r.get("description", ""),
|
| 166 |
+
source=self.name
|
| 167 |
+
))
|
| 168 |
+
return results
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Backend registry
|
| 172 |
+
BACKENDS: dict[str, SearchBackend] = {
|
| 173 |
+
"duckduckgo": DuckDuckGoBackend(),
|
| 174 |
+
"ddg": DuckDuckGoBackend(),
|
| 175 |
+
"serpapi": SerpAPIBackend(),
|
| 176 |
+
"brave": BraveBackend(),
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def get_backend(name: str = "duckduckgo") -> SearchBackend:
|
| 181 |
+
backend = BACKENDS.get(name)
|
| 182 |
+
if not backend:
|
| 183 |
+
raise ValueError(f"Unknown backend: {name}. Available: {list(BACKENDS.keys())}")
|
| 184 |
+
if not backend.is_available():
|
| 185 |
+
raise RuntimeError(f"Backend '{name}' not available. Check dependencies/API keys.")
|
| 186 |
+
return backend
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ============================================================
|
| 190 |
+
# Result Storage
|
| 191 |
+
# ============================================================
|
| 192 |
+
|
| 193 |
+
class ResultStorage:
|
| 194 |
+
|
| 195 |
+
def __init__(self, directory: Path = RESEARCH_DIR):
|
| 196 |
+
self.directory = directory
|
| 197 |
+
self.directory.mkdir(parents=True, exist_ok=True)
|
| 198 |
+
|
| 199 |
+
def save(self, query: str, results: list[WebSearchResult], backend: str) -> tuple[Path, Path]:
|
| 200 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 201 |
+
slug = self._slugify(query)
|
| 202 |
+
base_name = f"{timestamp}_{slug}"
|
| 203 |
+
|
| 204 |
+
data = {
|
| 205 |
+
"query": query,
|
| 206 |
+
"timestamp": datetime.now().isoformat(),
|
| 207 |
+
"backend": backend,
|
| 208 |
+
"result_count": len(results),
|
| 209 |
+
"results": [asdict(r) for r in results]
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
json_path = self.directory / f"{base_name}.json"
|
| 213 |
+
with open(json_path, "w") as f:
|
| 214 |
+
json.dump(data, f, indent=2)
|
| 215 |
+
|
| 216 |
+
md_path = self.directory / f"{base_name}.md"
|
| 217 |
+
with open(md_path, "w") as f:
|
| 218 |
+
f.write(f"# Search: {query}\n\n")
|
| 219 |
+
f.write(f"**Date:** {data['timestamp']} \n")
|
| 220 |
+
f.write(f"**Backend:** {backend} \n")
|
| 221 |
+
f.write(f"**Results:** {len(results)}\n\n---\n")
|
| 222 |
+
for i, r in enumerate(results, 1):
|
| 223 |
+
f.write(f"\n## {i}. {r.title}\n\n**URL:** {r.url}\n\n{r.snippet}\n")
|
| 224 |
+
|
| 225 |
+
return json_path, md_path
|
| 226 |
+
|
| 227 |
+
def list_searches(self, limit: int = 20) -> list[dict]:
|
| 228 |
+
searches = []
|
| 229 |
+
for json_file in sorted(self.directory.glob("*.json"), reverse=True):
|
| 230 |
+
if json_file.name.startswith("."):
|
| 231 |
+
continue
|
| 232 |
+
try:
|
| 233 |
+
with open(json_file) as f:
|
| 234 |
+
data = json.load(f)
|
| 235 |
+
searches.append({
|
| 236 |
+
"file": json_file.name,
|
| 237 |
+
"query": data.get("query", ""),
|
| 238 |
+
"timestamp": data.get("timestamp", ""),
|
| 239 |
+
"results": data.get("result_count", 0)
|
| 240 |
+
})
|
| 241 |
+
except:
|
| 242 |
+
pass
|
| 243 |
+
if len(searches) >= limit:
|
| 244 |
+
break
|
| 245 |
+
return searches
|
| 246 |
+
|
| 247 |
+
def get_recent_queries(self, days: int = 7) -> set[str]:
|
| 248 |
+
cutoff = datetime.now() - timedelta(days=days)
|
| 249 |
+
recent = set()
|
| 250 |
+
|
| 251 |
+
for json_file in self.directory.glob("*.json"):
|
| 252 |
+
if json_file.name.startswith("."):
|
| 253 |
+
continue
|
| 254 |
+
try:
|
| 255 |
+
with open(json_file) as f:
|
| 256 |
+
data = json.load(f)
|
| 257 |
+
ts = data.get("timestamp", "")
|
| 258 |
+
if ts:
|
| 259 |
+
file_date = datetime.fromisoformat(ts.replace("Z", "+00:00").split("+")[0])
|
| 260 |
+
if file_date >= cutoff:
|
| 261 |
+
query = data.get("query", "").lower().strip()
|
| 262 |
+
recent.add(query)
|
| 263 |
+
except:
|
| 264 |
+
pass
|
| 265 |
+
return recent
|
| 266 |
+
|
| 267 |
+
def _slugify(self, text: str, max_len: int = 50) -> str:
|
| 268 |
+
slug = text.lower()[:max_len].replace(" ", "-").replace("/", "-")
|
| 269 |
+
return "".join(c for c in slug if c.isalnum() or c == "-")
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
# ============================================================
|
| 273 |
+
# Search Service (Facade)
|
| 274 |
+
# ============================================================
|
| 275 |
+
|
| 276 |
+
class SearchService:
|
| 277 |
+
"""High-level search interface combining backend and storage."""
|
| 278 |
+
|
| 279 |
+
def __init__(self, backend: str = "duckduckgo", storage: ResultStorage = None):
|
| 280 |
+
self.backend = get_backend(backend)
|
| 281 |
+
self.storage = storage or ResultStorage()
|
| 282 |
+
|
| 283 |
+
def search(self, query: str, max_results: int = 10, save: bool = True) -> list[WebSearchResult]:
|
| 284 |
+
results = self.backend.search(query, max_results)
|
| 285 |
+
if save and results:
|
| 286 |
+
self.storage.save(query, results, self.backend.name)
|
| 287 |
+
return results
|
| 288 |
+
|
| 289 |
+
def search_batch(self, queries: list[str], max_results: int = 10,
|
| 290 |
+
delay: float = 0.5, callback=None) -> dict[str, int]:
|
| 291 |
+
stats = {}
|
| 292 |
+
for i, query in enumerate(queries, 1):
|
| 293 |
+
if callback:
|
| 294 |
+
callback(i, len(queries), query)
|
| 295 |
+
try:
|
| 296 |
+
results = self.search(query, max_results, save=True)
|
| 297 |
+
stats[query] = len(results)
|
| 298 |
+
except Exception as e:
|
| 299 |
+
stats[query] = -1
|
| 300 |
+
print(f"Error on '{query}': {e}", file=sys.stderr)
|
| 301 |
+
|
| 302 |
+
if delay > 0 and i < len(queries):
|
| 303 |
+
time.sleep(delay)
|
| 304 |
+
|
| 305 |
+
return stats
|
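The SearchService facade at the end of search.py is the intended entry point: it resolves a backend via get_backend() and persists results through ResultStorage. A minimal usage sketch, assuming the flat-file layout; the query strings and delay value are illustrative, not from the repo:

# Minimal usage sketch for the SearchService facade defined above.
# Query strings and the delay are illustrative examples.
from search import SearchService

service = SearchService(backend="duckduckgo")  # raises if the backend is unavailable

# Single query: results are also saved to RESEARCH_DIR as JSON + Markdown.
for r in service.search("rehabilitation robotics market", max_results=5):
    print(r.title, r.url)

# Batch mode: returns {query: result_count}; -1 marks a failed query.
stats = service.search_batch(
    ["Fourier Intelligence funding", "Myomo stock price"],
    max_results=5,
    delay=1.0,
)
print(stats)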
sota_agent.py
ADDED
@@ -0,0 +1,850 @@
#!/usr/bin/env python3
"""
GURMA.ai SOTA Technology Knowledge Agent

Maintains a persistent knowledge base of state-of-the-art models, techniques,
and tools relevant to GURMA.ai's high-precision medical/rehabilitation AI domain.

The agent is "aware" of GURMA.ai's strategic position (outcome data moat,
edge-first, safety-critical) and filters all technology developments through
that lens. It updates itself when fed new information (papers, podcasts,
announcements).

Usage:
    python research.py sota                    # Show current knowledge base
    python research.py sota --analyze notes/research/podcast-sota-models.md
    python research.py sota --show models      # Show tracked models
    python research.py sota --show techniques  # Show tracked techniques
    python research.py sota --show stack       # Show recommended tech stack
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Optional

try:
    from .llm import LLMClient
    from .config import RESEARCH_DIR, LLM_ENABLED
except ImportError:
    from llm import LLMClient
    from config import RESEARCH_DIR, LLM_ENABLED


# ============================================================
# GURMA.ai Context — what the agent "knows" about the company
# ============================================================

GURMA_CONTEXT = {
    "company": "GURMA.ai AG (Swiss)",
    "domain": "Rehabilitation robotics AI — high-precision medical domain",
    "data_moat": "15 years of patient outcome data from BAMA Teknoloji "
                 "(gait dynamics, EMG signals, recovery outcomes — not just motion data)",
    "products": ["RoboGate (stationary gait rehab robot)", "FreeGate (5-axis mobile exoskeleton)"],
    "architecture": "Privacy-first edge computing — no cloud data exposure",
    "regulatory": "EU AI Act (high-risk), MDR, ISO 13485, GDPR/KVKK — 80% safety focus from day one",
    "precision_requirement": (
        "Medical rehabilitation demands super-high precision: wrong therapy parameters "
        "can harm patients. Models must be verifiable, explainable, and fail-safe. "
        "This is NOT a domain where 'good enough' works — it requires domain-specific "
        "training on real outcome data with verifiable reward signals."
    ),
    "core_thesis": (
        "Proprietary outcome data + domain expertise + regulatory focus = defensible AI moat. "
        "Frontier labs are NOT focusing on domain-specific medical applications."
    ),
}

# ============================================================
# Relevance Scoring — how the agent filters incoming info
# ============================================================

GURMA_RELEVANCE_SIGNALS = {
    "outcome_data": {
        "weight": 3,
        "description": "Patient outcomes as training signal / verifiable rewards",
        "patterns": [
            r"outcome.?data", r"patient.?outcome", r"recovery.?outcome",
            r"treatment.?outcome", r"verifiable.?reward", r"reward.?model",
            r"clinical.?outcome", r"reward.?signal",
        ],
    },
    "rehabilitation": {
        "weight": 3,
        "description": "Rehabilitation, gait, exoskeleton, motor recovery tech",
        "patterns": [
            r"rehabilitat", r"gait.?(?:analysis|dynamic|training)",
            r"exoskeleton", r"physical.?therapy", r"motor.?recovery",
            r"neurorehab", r"stroke.?recovery",
        ],
    },
    "high_precision": {
        "weight": 3,
        "description": "High-precision / safety-critical model requirements",
        "patterns": [
            r"high.?precision", r"safety.?critical", r"fail.?(?:safe|never)",
            r"verification.?layer", r"verifiable", r"explainabl",
            r"clinical.?(?:accuracy|precision|validation)",
        ],
    },
    "domain_specific": {
        "weight": 2,
        "description": "Domain-specific fine-tuning — GURMA.ai's core approach",
        "patterns": [
            r"domain.specific", r"fine.tun", r"\bLoRA\b", r"specialized.?model",
            r"medical.?(?:model|AI|LLM)", r"clinical.?(?:NLP|model)",
            r"proprietary.?data",
        ],
    },
    "rl_training": {
        "weight": 2,
        "description": "RL post-training — outcome data as rewards",
        "patterns": [
            r"\bRLHF\b", r"\bRLVR\b", r"reinforcement.?learning",
            r"post.training", r"\bPPO\b", r"\bGRPO\b", r"reward.?shaping",
        ],
    },
    "edge_privacy": {
        "weight": 2,
        "description": "Edge inference, on-device, privacy-first deployment",
        "patterns": [
            r"edge.?(?:computing|deploy|inference|device)",
            r"on.device", r"privacy.first", r"local.?inference",
            r"quantiz", r"on.premise",
        ],
    },
    "safety_regulatory": {
        "weight": 2,
        "description": "AI safety, medical device regulation, verification",
        "patterns": [
            r"AI.?safety", r"medical.?device", r"(?:EU|FDA).?(?:AI|regulat)",
            r"\bMDR\b", r"CE.?mark", r"ISO.?13485", r"verification.?layer",
            r"constitutional.?AI",
        ],
    },
    "robotics": {
        "weight": 2,
        "description": "Robotics AI, world models, embodied AI, patient simulation",
        "patterns": [
            r"robot(?:ic)?s?.?(?:AI|control|learning)",
            r"world.?model", r"embodied.?AI", r"simulation.?model",
            r"continual.?learning",
        ],
    },
    "open_models": {
        "weight": 1,
        "description": "Open-weight models with clear licensing for medical use",
        "patterns": [
            r"open.weight", r"open.source.?(?:model|LLM)",
            r"\bQwen\b", r"\bOLMo\b", r"DeepSeek", r"\bLlama\b",
            r"\bMistral\b", r"gpt.oss",
        ],
    },
    "tool_use": {
        "weight": 1,
        "description": "Tool-calling AI for clinical workflow automation",
        "patterns": [
            r"tool.?(?:use|calling)", r"function.?call",
            r"(?:AI|LLM).?agent", r"autonomous.?agent",
        ],
    },
}
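How these signals combine (illustrative sketch, not part of the commit): each signal contributes its weight at most once per text, and the sum is normalized by the maximum possible weight (21 with the signals above). This is exactly what the agent's _score_relevance method does later in the file; a standalone demo:

import re

def demo_score(text: str) -> float:
    """Weighted-signal relevance score in [0, 1], mirroring _score_relevance."""
    hit = sum(
        sig["weight"]
        for sig in GURMA_RELEVANCE_SIGNALS.values()
        if any(re.search(p, text.lower(), re.IGNORECASE) for p in sig["patterns"])
    )
    max_weight = sum(sig["weight"] for sig in GURMA_RELEVANCE_SIGNALS.values())
    return round(hit / max_weight, 3)

# demo_score("RLVR fine-tuning on patient outcome data for edge deployment")
# hits rl_training(2) + domain_specific(2) + outcome_data(3) + edge_privacy(2) = 9/21, i.e. 0.429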

# ============================================================
# Initial Knowledge Base — seeded from podcast analysis
# ============================================================

INITIAL_KNOWLEDGE_BASE = {
    "models": [
        {
            "name": "Qwen 3",
            "params": "7B-32B range",
            "why": "Best open-weight performance (50T tokens trained), friendly commercial license",
            "gurma_fit": "Base model for domain fine-tuning; fewer restrictions than Llama for medical device use",
            "status": "recommended",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "OLMo 3",
            "params": "7B+",
            "why": "Fully documented training process, truly open (AI2), great for understanding methodology",
            "gurma_fit": "Best for learning/reproducing training; full transparency aids regulatory documentation",
            "status": "recommended",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "gpt-oss-120b",
            "params": "120B",
            "why": "First open model specifically trained with tool use in mind",
            "gurma_fit": "Tool-calling for patient data APIs, sensor queries, automated insurance reporting",
            "status": "watch",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "DeepSeek-V3.2",
            "params": "varies",
            "why": "Sparse attention architecture, efficient inference",
            "gurma_fit": "Sparse attention promising for edge deployment on RoboGate/FreeGate",
            "status": "watch",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
    ],
    "techniques": [
        {
            "name": "RLVR (Reinforcement Learning with Verifiable Rewards)",
            "category": "post-training",
            "why": "Post-training is the 'skill unlock' — pre-training gives knowledge, post-training gives precision",
            "gurma_fit": "Patient recovery outcomes ARE verifiable rewards. 15 years of outcome data = perfect RLVR signal.",
            "priority": "high",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "LoRA (Low-Rank Adaptation)",
            "category": "fine-tuning",
            "why": "Fine-tune only a small subset of weights; practical on limited compute; proven on 7B models",
            "gurma_fit": "Start with 7B models + LoRA for engagement scoring and outcome prediction. Efficient enough for iterative experiments.",
            "priority": "high",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "PPO / GRPO",
            "category": "post-training",
            "why": "Policy gradient algorithms for RL post-training; PPO is standard, GRPO is newer group-relative approach",
            "gurma_fit": "Algorithms to train models using outcome data as reward signal",
            "priority": "medium",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "Synthetic Data (reformatting)",
            "category": "data-processing",
            "why": "Not 'AI-generated fake data' — means restructuring real data into training formats (Q&A, summaries). OCR for medical PDFs.",
            "gurma_fit": "Convert EMG readings → Q&A format, session notes → case summaries, treatment logs → outcome predictions",
            "priority": "high",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "World Models",
            "category": "simulation",
            "why": "Model runs a simulation of the environment; verifies intermediate states, not just final results",
            "gurma_fit": "Patient progress modeling IS a world model problem. Verify intermediate therapy states, simulate treatment outcomes.",
            "priority": "medium",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "name": "Sparse Attention",
            "category": "efficiency",
            "why": "Lightweight token selection indexer; efficient inference for edge deployment",
            "gurma_fit": "Could enable on-device models for RoboGate/FreeGate with privacy-first architecture",
            "priority": "medium",
            "added": "2026-02-06",
            "source": "Lex Fridman Podcast #490",
        },
    ],
    "tech_stack": [
        {"component": "Base Model", "recommendation": "Qwen 3 (7B-32B) or OLMo 3", "rationale": "Open weights, good license, well-documented"},
        {"component": "Fine-tuning", "recommendation": "LoRA + RLVR", "rationale": "Practical compute, outcome-based rewards"},
        {"component": "Tool Use Model", "recommendation": "gpt-oss-120b", "rationale": "Specifically trained for tool calling"},
        {"component": "Training Framework", "recommendation": "TRL (Hugging Face)", "rationale": "RLHF/RLVR implementation"},
        {"component": "Inference", "recommendation": "vLLM or SGLang", "rationale": "Production-ready, NOT HF Transformers in prod"},
        {"component": "Edge", "recommendation": "Quantized models + sparse attention", "rationale": "Privacy-first deployment"},
        {"component": "Safety", "recommendation": "Constitutional AI principles + verification layers", "rationale": "Medical device compliance, 'allowed to fail never'"},
    ],
    "key_principles": [
        {
            "principle": "Domain-specific data is the moat",
            "detail": "Frontier labs won't build rehab-specific models. Proprietary outcome data that nobody else can access is the defensible advantage.",
            "source": "Lex Fridman Podcast #490 — Sebastian Raschka",
        },
        {
            "principle": "Post-training over pre-training",
            "detail": "Don't spend on pre-training. Use open base models + invest in post-training (RLVR) where outcome data becomes the competitive edge.",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "principle": "Data quality over quantity",
            "detail": "Curate aggressively. Reformat existing data into multiple training formats. Clean > big.",
            "source": "Lex Fridman Podcast #490 — Nathan Lambert",
        },
        {
            "principle": "Human verification mandatory for medical AI",
            "detail": "Tool-calling and autonomous agents still require human-in-the-loop for trust/safety in clinical context.",
            "source": "Lex Fridman Podcast #490",
        },
        {
            "principle": "High precision is non-negotiable",
            "detail": "Medical rehab is a 'fail never' domain. Models must be verifiable, with intermediate state checking (world model approach).",
            "source": "Lex Fridman Podcast #490 — Lex Fridman",
        },
    ],
    "sources_analyzed": [
        {
            "name": "Lex Fridman Podcast #490 — State of AI in 2026",
            "type": "podcast",
            "date": "2026-02-06",
            "key_speakers": "Nathan Lambert (AI2), Sebastian Raschka",
            "insights_extracted": 10,
        },
    ],
    "action_items": [
        {"item": "Experiment with Qwen 3 / OLMo 3 on rehabilitation domain prompts", "status": "pending"},
        {"item": "Structure outcome data for RLVR — create verifiable reward functions (gait improvement score, session completion rate)", "status": "pending"},
        {"item": "Build tool schemas — APIs for patient data access, sensor queries, report generation", "status": "pending"},
        {"item": "Set up vLLM for production inference", "status": "pending"},
        {"item": "Study Nathan Lambert's RLHF book when released", "status": "pending"},
    ],
    "metadata": {
        "created": "2026-02-06",
        "last_updated": "2026-02-06",
        "version": 1,
    },
}
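Since the seed KB is plain JSON-serializable data, slicing it needs nothing beyond comprehensions (illustrative only):

recommended = [m["name"] for m in INITIAL_KNOWLEDGE_BASE["models"]
               if m["status"] == "recommended"]        # ['Qwen 3', 'OLMo 3']
pending = [a["item"] for a in INITIAL_KNOWLEDGE_BASE["action_items"]
           if a["status"] == "pending"]                # all five seeded actions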


# ============================================================
# SOTA Knowledge Agent
# ============================================================

class SOTAScoutAgent:
    """Maintains and updates GURMA.ai's SOTA technology knowledge base.

    The agent understands that GURMA.ai operates in a high-precision medical
    domain where model accuracy, verifiability, and safety are non-negotiable.
    It filters all technology developments through this lens.

    Usage:
        agent = SOTAScoutAgent()
        agent.show()                                # Print current KB state
        agent.show("models")                        # Show tracked models
        agent.analyze("notes/research/podcast.md")  # Analyze + update KB
    """

    def __init__(self, llm: LLMClient = None):
        self.llm = llm or LLMClient()
        self.kb_dir = RESEARCH_DIR / "sota"
        self.kb_dir.mkdir(parents=True, exist_ok=True)
        self.kb_path = self.kb_dir / "knowledge_base.json"
        self.kb = self._load_kb()

    # ----------------------------------------------------------
    # Persistence
    # ----------------------------------------------------------

    def _load_kb(self) -> dict:
        """Load existing KB or initialize from seed."""
        if self.kb_path.exists():
            try:
                with open(self.kb_path) as f:
                    return json.load(f)
            except Exception:
                pass
        # First run — seed from initial knowledge
        kb = json.loads(json.dumps(INITIAL_KNOWLEDGE_BASE))
        self._save_kb(kb)
        return kb

    def _save_kb(self, kb: dict = None):
        """Persist knowledge base to disk."""
        kb = kb or self.kb
        kb["metadata"]["last_updated"] = datetime.now().strftime("%Y-%m-%d")
        with open(self.kb_path, "w") as f:
            json.dump(kb, f, indent=2)

    # ----------------------------------------------------------
    # Public: Show
    # ----------------------------------------------------------

    def show(self, section: str = None) -> dict:
        """Display current knowledge base state.

        Args:
            section: Optional — "models", "techniques", "stack", "principles",
                     "actions", "sources". None = summary of everything.

        Returns: The KB data (also prints to stdout).
        """
        if section == "models":
            self._print_models()
        elif section == "techniques":
            self._print_techniques()
        elif section == "stack":
            self._print_stack()
        elif section == "principles":
            self._print_principles()
        elif section == "actions":
            self._print_actions()
        elif section == "sources":
            self._print_sources()
        else:
            self._print_summary()

        return self.kb

    def _print_summary(self):
        meta = self.kb.get("metadata", {})
        models = self.kb.get("models", [])
        techniques = self.kb.get("techniques", [])
        stack = self.kb.get("tech_stack", [])
        principles = self.kb.get("key_principles", [])
        actions = self.kb.get("action_items", [])
        sources = self.kb.get("sources_analyzed", [])

        print(f"\n{'='*60}")
        print("GURMA.ai SOTA Knowledge Base")
        print(f"{'='*60}")
        print(f"Last updated: {meta.get('last_updated', 'unknown')}")
        print(f"Version: {meta.get('version', 0)}")
        print()
        print(f"  Models tracked:     {len(models)}")
        print(f"  Techniques tracked: {len(techniques)}")
        print(f"  Tech stack items:   {len(stack)}")
        print(f"  Key principles:     {len(principles)}")
        print(f"  Action items:       {len(actions)} ({sum(1 for a in actions if a.get('status') == 'pending')} pending)")
        print(f"  Sources analyzed:   {len(sources)}")
        print()

        rec_models = [m for m in models if m.get("status") == "recommended"]
        if rec_models:
            print("Recommended models:")
            for m in rec_models:
                print(f"  * {m['name']} ({m.get('params', '?')}) — {m.get('gurma_fit', '')[:80]}")

        high_tech = [t for t in techniques if t.get("priority") == "high"]
        if high_tech:
            print("\nHigh-priority techniques:")
            for t in high_tech:
                print(f"  * {t['name']} — {t.get('gurma_fit', '')[:80]}")

        pending = [a for a in actions if a.get("status") == "pending"]
        if pending:
            print("\nPending action items:")
            for a in pending[:5]:
                print(f"  [ ] {a['item']}")

        print(f"\n{'='*60}")

    def _print_models(self):
        print("\n--- Tracked Models ---\n")
        for m in self.kb.get("models", []):
            status_icon = {"recommended": "*", "watch": "~", "deprecated": "x"}.get(m.get("status", ""), "?")
            print(f"  [{status_icon}] {m['name']} ({m.get('params', '?')})")
            print(f"      Why: {m.get('why', '')}")
            print(f"      GURMA.ai fit: {m.get('gurma_fit', '')}")
            print(f"      Source: {m.get('source', '')} | Added: {m.get('added', '')}")
            print()

    def _print_techniques(self):
        print("\n--- Tracked Techniques ---\n")
        for t in self.kb.get("techniques", []):
            pri = {"high": "!!!", "medium": "!!", "low": "!"}.get(t.get("priority", ""), "?")
            print(f"  [{pri}] {t['name']} ({t.get('category', '')})")
            print(f"      Why: {t.get('why', '')}")
            print(f"      GURMA.ai fit: {t.get('gurma_fit', '')}")
            print()

    def _print_stack(self):
        print("\n--- Recommended Tech Stack ---\n")
        for s in self.kb.get("tech_stack", []):
            print(f"  {s['component']:20s} -> {s['recommendation']}")
            print(f"  {'':20s}    {s.get('rationale', '')}")
            print()

    def _print_principles(self):
        print("\n--- Key Principles ---\n")
        for p in self.kb.get("key_principles", []):
            print(f"  * {p['principle']}")
            print(f"    {p.get('detail', '')}")
            print()

    def _print_actions(self):
        print("\n--- Action Items ---\n")
        for a in self.kb.get("action_items", []):
            icon = "[x]" if a.get("status") == "done" else "[ ]"
            print(f"  {icon} {a['item']}")

    def _print_sources(self):
        print("\n--- Analyzed Sources ---\n")
        for s in self.kb.get("sources_analyzed", []):
            print(f"  {s.get('date', '?')} | {s['name']} ({s.get('type', '')})")
            if s.get("key_speakers"):
                print(f"      Speakers: {s['key_speakers']}")
            print(f"      Insights extracted: {s.get('insights_extracted', 0)}")
            print()

    # ----------------------------------------------------------
    # Public: Analyze document and update KB
    # ----------------------------------------------------------

    def analyze(self, file_path: str) -> Path:
        """Analyze a document for GURMA.ai-relevant SOTA insights and update KB.

        Reads the file, scores sections for relevance, uses LLM to extract
        structured insights, and merges new findings into the knowledge base.

        Args:
            file_path: Path to markdown/text file

        Returns: Path to generated analysis report
        """
        try:
            from .config import PROJECT_ROOT
        except ImportError:
            from config import PROJECT_ROOT

        path = Path(file_path)
        if not path.is_absolute():
            path = PROJECT_ROOT / file_path

        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        print(f"\n{'='*60}")
        print(f"Analyzing: {path.name}")
        print(f"{'='*60}\n")

        text = path.read_text(encoding="utf-8")

        # Score sections for relevance
        sections = self._split_sections(text)
        scored = []
        for sec in sections:
            if len(sec.strip()) < 50:
                continue
            score, tags = self._score_relevance(sec)
            if score > 0:
                scored.append({"text": sec.strip()[:500], "score": score, "tags": tags})
        scored.sort(key=lambda x: -x["score"])

        print(f"Sections: {len(sections)} total, {len(scored)} relevant")

        # LLM extraction — structured insights for KB update
        llm_update = None
        if self.llm.enabled:
            print("[LLM] Extracting structured insights...")
            llm_update = self._extract_kb_updates(text, path.name)
            if llm_update:
                n_models = len(llm_update.get("new_models", []))
                n_tech = len(llm_update.get("new_techniques", []))
                n_actions = len(llm_update.get("new_action_items", []))
                print(f"[LLM] Found: {n_models} models, {n_tech} techniques, {n_actions} action items")
        else:
            print("[INFO] LLM not available — relevance scoring only, no KB update")

        # Merge into knowledge base
        changes = self._merge_updates(llm_update, path.name, len(scored))

        # Generate report
        report_path = self._generate_report(path.name, scored, llm_update, changes)

        # Save updated KB
        self._save_kb()

        print(f"\nKB updated: {changes}")
        print(f"Report: {report_path}")
        return report_path

    # ----------------------------------------------------------
    # Internal: Relevance scoring
    # ----------------------------------------------------------

    def _score_relevance(self, text: str) -> tuple[float, list[str]]:
        """Score text against GURMA.ai's high-precision domain themes.

        Returns (score 0.0-1.0, matched signal tags).
        """
        text_lower = text.lower()
        total_weight = 0
        max_possible = sum(s["weight"] for s in GURMA_RELEVANCE_SIGNALS.values())
        matched_tags = []

        for tag, signal in GURMA_RELEVANCE_SIGNALS.items():
            for pattern in signal["patterns"]:
                if re.search(pattern, text_lower, re.IGNORECASE):
                    total_weight += signal["weight"]
                    matched_tags.append(tag)
                    break

        score = min(total_weight / max_possible, 1.0) if max_possible > 0 else 0.0
        return round(score, 3), matched_tags

    def _split_sections(self, text: str) -> list[str]:
        """Split markdown into logical sections."""
        sections = re.split(r'\n#{1,3}\s+', text)
        return [s.strip() for s in sections if s.strip()]

    # ----------------------------------------------------------
    # Internal: LLM extraction for KB update
    # ----------------------------------------------------------

    def _extract_kb_updates(self, text: str, filename: str) -> Optional[dict]:
        """Use LLM to extract structured KB updates from a document.

        Returns dict with new_models, new_techniques, stack_updates,
        new_principles, new_action_items, and strategic_note.
        """
        truncated = text[:12000]

        # Include current KB state so LLM can detect what's truly new
        current_models = ", ".join(m["name"] for m in self.kb.get("models", []))
        current_techniques = ", ".join(t["name"] for t in self.kb.get("techniques", []))

        system = (
            "You are the technology intelligence agent for GURMA.ai, a Swiss AI company "
            "building high-precision models for rehabilitation robotics. "
            "GURMA.ai has 15 years of patient outcome data (gait dynamics, EMG, recovery outcomes) "
            "from BAMA Teknoloji. Their domain requires SUPER-HIGH PRECISION — "
            "wrong therapy parameters can harm patients. "
            "Architecture: privacy-first edge computing. "
            "Regulatory: EU AI Act (high-risk), MDR, ISO 13485. "
            "Core thesis: proprietary outcome data + domain expertise + safety focus = defensible moat. "
            "Your job: extract technology insights that help GURMA.ai build better, "
            "safer, more precise models for this domain."
        )

        prompt = f"""Analyze this document and extract NEW technology insights for GURMA.ai's knowledge base.

Document: {filename}
---
{truncated}
---

Already tracked models: {current_models}
Already tracked techniques: {current_techniques}

Extract ONLY information that is NEW or updates existing knowledge.
Focus on what matters for a high-precision medical AI domain.

Return JSON:
{{
  "new_models": [
    {{
      "name": "Model name",
      "params": "Size/params",
      "why": "Why it matters in general",
      "gurma_fit": "Specific fit for GURMA.ai's high-precision rehab domain",
      "status": "recommended|watch|deprecated"
    }}
  ],
  "new_techniques": [
    {{
      "name": "Technique name",
      "category": "post-training|fine-tuning|data-processing|efficiency|simulation|safety",
      "why": "Why it matters",
      "gurma_fit": "How GURMA.ai should use it for high-precision medical AI",
      "priority": "high|medium|low"
    }}
  ],
  "stack_updates": [
    {{
      "component": "Which tech stack component to update",
      "recommendation": "New recommendation",
      "rationale": "Why this change"
    }}
  ],
  "new_principles": [
    {{
      "principle": "Short principle statement",
      "detail": "Explanation and evidence"
    }}
  ],
  "new_action_items": [
    "Concrete next step for GURMA.ai"
  ],
  "strategic_note": "How this document affects GURMA.ai's strategy (1-2 sentences, or null if no change)"
}}

Rules:
- ONLY include genuinely new information not already in the tracked lists
- Every item must connect to GURMA.ai's HIGH-PRECISION medical domain
- If a model/technique is already tracked, skip it (don't duplicate)
- If existing knowledge should be UPDATED (e.g. new version), include it with the update
- Be specific: name versions, papers, benchmarks
- Empty arrays are fine if nothing new is found"""

        response = self.llm.call(prompt, system, max_tokens=3000)
        if response:
            match = re.search(r'\{.*\}', response, re.DOTALL)
            if match:
                try:
                    return json.loads(match.group())
                except Exception:
                    pass
        return None

    # ----------------------------------------------------------
    # Internal: Merge updates into KB
    # ----------------------------------------------------------

    def _merge_updates(self, llm_update: Optional[dict], source_name: str,
                       insights_count: int) -> dict:
        """Merge LLM-extracted updates into the knowledge base.

        Returns summary of changes made.
        """
        changes = {"models_added": 0, "techniques_added": 0,
                   "stack_updated": 0, "principles_added": 0,
                   "actions_added": 0}

        # Record source
        self.kb.setdefault("sources_analyzed", []).append({
            "name": source_name,
            "type": "document",
            "date": datetime.now().strftime("%Y-%m-%d"),
            "insights_extracted": insights_count,
        })

        if not llm_update:
            return changes

        today = datetime.now().strftime("%Y-%m-%d")

        # Merge models
        existing_names = {m["name"].lower() for m in self.kb.get("models", [])}
        for m in llm_update.get("new_models", []):
            if m.get("name", "").lower() not in existing_names:
                m["added"] = today
                m["source"] = source_name
                self.kb["models"].append(m)
                changes["models_added"] += 1

        # Merge techniques
        existing_tech = {t["name"].lower() for t in self.kb.get("techniques", [])}
        for t in llm_update.get("new_techniques", []):
            if t.get("name", "").lower() not in existing_tech:
                t["added"] = today
                t["source"] = source_name
                self.kb["techniques"].append(t)
                changes["techniques_added"] += 1

        # Stack updates — replace matching components
        for su in llm_update.get("stack_updates", []):
            component = su.get("component", "")
            updated = False
            for i, existing in enumerate(self.kb.get("tech_stack", [])):
                if existing["component"].lower() == component.lower():
                    self.kb["tech_stack"][i] = su
                    updated = True
                    changes["stack_updated"] += 1
                    break
            if not updated and component:
                self.kb["tech_stack"].append(su)
                changes["stack_updated"] += 1

        # Merge principles
        existing_principles = {p["principle"].lower() for p in self.kb.get("key_principles", [])}
        for p in llm_update.get("new_principles", []):
            if p.get("principle", "").lower() not in existing_principles:
                p["source"] = source_name
                self.kb["key_principles"].append(p)
                changes["principles_added"] += 1

        # Merge action items
        existing_actions = {a["item"].lower() for a in self.kb.get("action_items", [])}
        for ai in llm_update.get("new_action_items", []):
            if ai.lower() not in existing_actions:
                self.kb["action_items"].append({"item": ai, "status": "pending"})
                changes["actions_added"] += 1

        # Bump version
        self.kb["metadata"]["version"] = self.kb["metadata"].get("version", 0) + 1

        return changes

    # ----------------------------------------------------------
    # Internal: Report generation
    # ----------------------------------------------------------

    def _generate_report(self, filename: str, scored_sections: list[dict],
                         llm_update: Optional[dict], changes: dict) -> Path:
        """Generate analysis report as markdown."""
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        slug = re.sub(r'[^a-z0-9]', '-', filename.lower().rsplit('.', 1)[0])[:40]
        report_path = self.kb_dir / f"{timestamp}_{slug}_analysis.md"

        lines = [
            f"# SOTA Analysis: {filename}",
            "",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}  ",
            f"**Source:** {filename}  ",
            f"**Relevant sections:** {len(scored_sections)}  ",
            f"**KB version:** {self.kb['metadata'].get('version', '?')}  ",
            "",
        ]

        # Changes summary
        total_changes = sum(changes.values())
        if total_changes > 0:
            lines.append("## Knowledge Base Updates")
            lines.append("")
            if changes["models_added"]:
                lines.append(f"- **{changes['models_added']}** new model(s) added")
            if changes["techniques_added"]:
                lines.append(f"- **{changes['techniques_added']}** new technique(s) added")
            if changes["stack_updated"]:
                lines.append(f"- **{changes['stack_updated']}** tech stack update(s)")
            if changes["principles_added"]:
                lines.append(f"- **{changes['principles_added']}** new principle(s)")
            if changes["actions_added"]:
                lines.append(f"- **{changes['actions_added']}** new action item(s)")
            lines.append("")
        else:
            lines.append("*No new knowledge extracted (document may cover already-tracked topics).*")
            lines.append("")

        # LLM-extracted details
        if llm_update:
            if llm_update.get("strategic_note"):
                lines.append("## Strategic Note")
                lines.append(llm_update["strategic_note"])
                lines.append("")

            for m in llm_update.get("new_models", []):
                lines.append(f"### New Model: {m.get('name', '?')}")
                lines.append(f"- **Params:** {m.get('params', '?')}")
                lines.append(f"- **Why:** {m.get('why', '')}")
                lines.append(f"- **GURMA.ai fit:** {m.get('gurma_fit', '')}")
                lines.append("")

            for t in llm_update.get("new_techniques", []):
                lines.append(f"### New Technique: {t.get('name', '?')}")
                lines.append(f"- **Category:** {t.get('category', '?')}")
                lines.append(f"- **Why:** {t.get('why', '')}")
                lines.append(f"- **GURMA.ai fit:** {t.get('gurma_fit', '')}")
                lines.append(f"- **Priority:** {t.get('priority', '?')}")
                lines.append("")

            if llm_update.get("new_action_items"):
                lines.append("## New Action Items")
                lines.append("")
                for ai in llm_update["new_action_items"]:
                    lines.append(f"- [ ] {ai}")
                lines.append("")

        # Relevance-scored sections
        if scored_sections:
            lines.append("---")
            lines.append("")
            lines.append("## Relevance-Scored Sections")
            lines.append("")
            for s in scored_sections[:10]:
                tags_str = ", ".join(s["tags"])
                lines.append(f"**Score: {s['score']:.2f}** — tags: {tags_str}")
                lines.append(f"> {s['text'][:300]}")
                lines.append("")

        with open(report_path, "w") as f:
            f.write("\n".join(lines))

        return report_path
tr_agents.py
ADDED
@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""
GURMA.ai Turkish Research Agents - v2 (Enhanced)

Two specialized agents for Turkish-language web research, enhanced with strategic
context for Gurma AI's market entry.

1. **MaliMusavirAgent**: Researches company formation, tax, IP, and medical
   device regulations.

2. **FonArastirmaAgent**: Researches R&D funding, focusing on bilateral
   Swiss-Turkish opportunities and leveraging the BAMA partnership.

Both agents search in Turkish and produce structured data for a Cursor agent
(e.g., Claude Opus) to synthesize into actionable reports.

Usage:
    # Ensure gurma-context.md is in the same directory
    python research.py mali              # Full company formation research
    python research.py fonlar -c tubitak # Specific funding category
"""

from __future__ import annotations

import json
import re
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

try:
    from .search import SearchService
    from .config import RESEARCH_DIR
except ImportError:
    from search import SearchService
    from config import RESEARCH_DIR

# ============================================================
# Turkish Authoritative Source Domains
# ============================================================

TR_PRIMARY_SOURCES = {
    "tubitak.gov.tr", "kosgeb.gov.tr", "sanayi.gov.tr",
    "ticaret.gov.tr", "ticaretsicil.gov.tr", "gib.gov.tr",
    "resmigazete.gov.tr", "mevzuat.gov.tr", "iskur.gov.tr",
    "yatirimadestek.gov.tr", "teydeb.tubitak.gov.tr",
    "teknokent.org.tr", "teknopark.gov.tr",  # .org.tr is also common
    "btk.gov.tr", "kvkk.gov.tr", "titck.gov.tr",  # Turkish Medicines and Medical Devices Agency
    "ailevecalisma.gov.tr", "sgk.gov.tr",
    "invest.gov.tr",  # Investment Office of Turkey
}
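A hypothetical helper (not part of this commit) showing how the set above could be used: flag results whose host is an official .gov.tr/.org.tr source so the agents can rank primary sources above blogs and aggregators.

from urllib.parse import urlparse

def is_primary_source(url: str) -> bool:
    """True if the URL's host is (or is a subdomain of) an official TR source."""
    host = urlparse(url).netloc.lower().split(":")[0]
    return any(host == d or host.endswith("." + d) for d in TR_PRIMARY_SOURCES)

# is_primary_source("https://www.tubitak.gov.tr/tr/duyurular")  -> True
# is_primary_source("https://example-blog.com/tubitak-rehberi") -> False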


# ============================================================
# Agent 1: Mali Müşavir — Company Formation & Regulation
# ============================================================

MALI_CATEGORIES = {
    "sirket_kurulum": {
        "label": "Şirket Kuruluş Adımları",
        "queries": [
            "yabancı sermayeli teknoloji şirketi kuruluş adımları türkiye 2025 2026",
            "limited şirket (ltd) ve anonim şirket (aş) kuruluş prosedürleri",
            "türkiye'de şirket kurmak için gerekli belgeler ticaret sicil noter",
            "isviçre merkezli bir şirketin türkiye'de şube veya yan kuruluş açması",
            "online şirket kuruluşu mümkün mü MERSİS süreci",
        ],
    },
    "sirket_turu": {
        "label": "Şirket Türü Seçimi (AR-GE Odaklı)",
        "queries": [
            "ltd mi aş mi AR-GE ve yazılım şirketi için karşılaştırma 2026",
            "anonim şirket ve limited şirket vergi ve sorumluluk farkları",
            "yabancı ortaklı şirketler için en uygun şirket türü türkiye",
            "devlet teşvikleri ve fonlara erişim için şirket türü önemli mi",
        ],
    },
    "vergi_tesvik": {
        "label": "Vergi ve Teşvikler (Teknoloji)",
        "queries": [
            "5746 sayılı AR-GE kanunu teşvikleri güncel 2025 2026",
            "teknokent dışı AR-GE merkezi vergi avantajları",
            "yazılım ve yapay zeka ihracatı vergi istisnaları türkiye",
            "kurumlar vergisi ve KDV istisnası teknoloji şirketleri",
            "SGK işveren primi desteği AR-GE personeli için",
        ],
    },
    "teknokent_teknopark": {
        "label": "Teknokent ve AR-GE Merkezleri",
        "queries": [
            "teknopark başvuru ve kabul kriterleri yapay zeka medikal cihaz",
            "istanbul ankara izmir önde gelen teknoparklar ve uzmanlık alanları",
            "teknopark avantajları vergi kira altyapı",
            "AR-GE merkezi kurma şartları ve avantajları teknopark dışında",
            "BAMA Teknoloji hangi teknoparkta yer alıyor",
        ],
    },
    "maliyet_surec": {
        "label": "Maliyet ve Süreç Takvimi",
        "queries": [
            "türkiye'de şirket kuruluş toplam maliyeti 2026 (noter harç sermaye)",
            "şirket kuruluş süresi ortalama kaç gün 2026",
            "kuruluş sonrası zorunlu adımlar (SGK vergi dairesi belediye)",
            "aylık sabit giderler teknoloji şirketi (muhasebe bağkur sgk)",
        ],
    },
    "ip_data_sovereignty": {
        "label": "Fikri Mülkiyet ve Veri Mevzuatı",
        "queries": [
            "türkiye'de yazılım ve yapay zeka algoritması fikri mülkiyet koruması",
            "KVKK (kişisel verilerin korunması kanunu) sağlık verileri yönetmeliği",
            "sağlık verilerinin yurtdışına aktarımı KVKK izinler",
            "anonimleştirilmiş veri ile AR-GE çalışması yasal çerçeve türkiye",
            "isviçre-türkiye veri transferi anlaşmaları",
        ],
    },
    "regulatory_medical": {
        "label": "Medikal Cihaz Mevzuatı (AI Odaklı)",
        "queries": [
            "TİTCK yapay zeka tabanlı yazılımlar için medikal cihaz düzenlemesi",
            "türkiye medikal cihaz yönetmeliği (MDR) ve CE işareti tanınırlığı",
            "yapay zeka rehabilitasyon cihazları için klinik araştırma gereklilikleri türkiye",
            "tıbbi cihaz kayıt ve onay süreci TİTCK ÜTS sistemi",
            "yazılım bir tıbbi cihaz mıdır (SaMD) türkiye sınıflandırması",
        ],
    },
}

MALI_SYNTHESIS_QUESTIONS = {
    "sirket_kurulum": [
        "İsviçre merkezli Gurma AI için Türkiye'de bir yan kuruluş (subsidiary) kurmanın adımları nelerdir?",
        "Gerekli ana belgeler nelerdir ve bu belgelerin İsviçre'den nasıl hazırlanması gerekir (apostil vb.)?",
        "Sürecin ne kadarı uzaktan (online) yönetilebilir, hangi aşamalarda Türkiye'de fiziksel bulunma zorunludur?",
    ],
    "sirket_turu": [
        "Gurma'nın AR-GE ve fon odaklı hedefleri için Ltd. mi A.Ş. mi daha mantıklı? Karar matrisi oluşturun.",
        "Minimum sermaye gereksinimleri ve bu sermayenin blokesi/kullanımı nasıl işliyor?",
        "Seçilen şirket türü, gelecekte yatırımcı alma veya hisse devri operasyonlarını nasıl etkiler?",
    ],
    "vergi_tesvik": [
        "Gurma'nın yararlanabileceği temel vergi avantajları (Kurumlar, KDV, Gelir Vergisi Stopajı) nelerdir?",
        "5746 sayılı kanun kapsamında, Teknopark içinde ve dışında olmanın avantaj/dezavantajları nelerdir?",
        "Yapay zeka ve medikal cihaz ihracatı için özel bir vergi indirimi var mı?",
    ],
    "teknokent_teknopark": [
        "Gurma'nın profiline (AI + Medikal Cihaz) en uygun 3 Teknopark hangisidir ve neden?",
        "Teknopark'a kabul için proje başvurusunda nelere dikkat edilmeli? BAMA'nın deneyiminden nasıl yararlanılır?",
        "Teknopark'ta yer almanın IP koruması ve veri güvenliği açısından ek bir avantajı var mı?",
    ],
    "maliyet_surec": [
        "Bir A.Ş. kurmak için başlangıçta ne kadar bir bütçe ayrılmalı (minimum sermaye + masraflar)?",
        "Şirketin yasal olarak faaliyete geçmesi ne kadar sürer? (iyimser ve kötümser senaryo)",
        "Faaliyete geçtikten sonraki ilk 3 ay içinde tamamlanması gereken zorunlu işlemler nelerdir?",
    ],
    "ip_data_sovereignty": [
        "Gurma'nın temel IP'si olan AI modellerini Türkiye'de nasıl koruma altına alabiliriz? (Patent, telif hakkı vb.)",
        "KVKK uyarınca, rehabilitasyon verilerini işlerken nelere dikkat edilmeli? Veri Türkiye'de mi kalmalı?",
        "İsviçre'deki ana şirket ile Türkiye'deki AR-GE birimi arasında veri (özellikle anonimleştirilmiş hasta verisi) transferi için yasal zemin nedir?",
    ],
    "regulatory_medical": [
        "Gurma'nın AI yazılımı Türkiye'de bir 'tıbbi cihaz' olarak kabul edilecek mi? TİTCK'nın bu konudaki kriterleri nelerdir?",
        "Eğer tıbbi cihaz ise, AB'den alınacak bir CE belgesi Türkiye'de doğrudan geçerli midir, yoksa ek TİTCK onayı gerekir mi?",
        "Pazara sunmadan önce Türkiye'de bir klinik doğrulama/araştırma yapma zorunluluğu var mı?",
    ],
}
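A minimal sketch of how a runner can drive one category (illustrative; the MaliMusavirAgent that this module's docstring describes sits outside the hunk shown here): each category's Turkish queries go through SearchService, which persists the raw results for later synthesis against the questions above.

def run_mali_category(key: str, max_results: int = 8) -> dict[str, int]:
    """Run every Turkish query in one MALI category through SearchService."""
    cat = MALI_CATEGORIES[key]
    print(f"== {cat['label']} ==")
    svc = SearchService()  # duckduckgo backend + default result storage
    return svc.search_batch(cat["queries"], max_results=max_results, delay=1.0)

# run_mali_category("regulatory_medical")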


# ============================================================
# Agent 2: Fon Araştırma — TÜBİTAK & Stratejik Ortaklık
# ============================================================

FON_CATEGORIES = {
    "tubitak": {
        "label": "TÜBİTAK Destek Programları",
        "queries": [
            "TÜBİTAK TEYDEB 1501 1507 destek programları yapay zeka medikal cihaz 2026",
            "TÜBİTAK 1702 patent lisanslama desteği yabancı teknoloji",
            "TÜBİTAK yapay zeka enstitüsü proje çağrıları",
            "TÜBİTAK sağlık bilimleri araştırma destek grubu (SBAG) çağrıları",
            "yeni kurulan teknoloji şirketleri için TÜBİTAK BİGG programı şartları",
        ],
    },
    "kosgeb": {
        "label": "KOSGEB Destekleri",
        "queries": [
            "KOSGEB AR-GE ÜR-GE ve inovasyon destek programı 2026",
            "KOSGEB teknoloji odaklı sanayi hamlesi programı medikal cihaz",
            "yabancı ortaklı KOBİ'ler KOSGEB desteklerinden yararlanabilir mi",
            "KOSGEB stratejik ürün destek programı şartları",
        ],
    },
    "sanayi_bakanligi": {
        "label": "Sanayi Bakanlığı ve Kalkınma Ajansları",
        "queries": [
            "sanayi ve teknoloji bakanlığı yatırım teşvik belgesi AR-GE yatırımı",
            "ulusal yapay zeka stratejisi 2025 kapsamında açılan fon ve destekler",
            "kalkınma ajansları (ISTKA IZMIRKA) güdümlü proje desteği sağlık teknolojileri",
            "teknoloji geliştirme bölgeleri (TGB) ek destek ve hibeler",
        ],
    },
    "ab_fonlari": {
        "label": "AB ve Uluslararası Fonlar (İsviçre-Türkiye Odaklı)",
        "queries": [
            "TÜBİTAK-SNSF (İsviçre) ikili işbirliği programı ve başvuru şartları",
            "EUREKA Eurostars programı türkiye isviçre ortaklığı",
            "Horizon Europe programına türkiye'den katılım ve yapay zeka sağlık çağrıları",
            "İsviçre ve Türkiye'nin ortak katıldığı uluslararası AR-GE fonları",
        ],
    },
    "basvuru_surec": {
        "label": "Başvuru Süreçleri ve Stratejileri",
        "queries": [
            "TÜBİTAK TEYDEB proje önerisi hazırlama kılavuzu ve hakem değerlendirme kriterleri",
            "başarılı bir TÜBİTAK projesi bütçesi nasıl hazırlanır (personel makine hizmet alımı)",
            "TÜBİTAK proje başvurularında sık yapılan hatalar ve reddedilme nedenleri",
            "proje yürütücüsü ve AR-GE personeli nitelikleri TÜBİTAK kriterleri",
        ],
    },
    "bama_joint_strategy": {
        "label": "BAMA Ortaklığı ile Stratejik Fon Başvurusu",
        "queries": [
            "TÜBİTAK ortaklı proje başvurusu nasıl yapılır (KOBİ-KOBİ işbirliği)",
            "BAMA Teknoloji'nin 'yerli üretici' statüsü fon başvurularında avantaj sağlar mı",
            "BAMA Teknoloji'nin tamamladığı TÜBİTAK veya KOSGEB projeleri var mı",
            "Gurma (yeni) ve BAMA (tecrübeli) ortaklığında bir proje kurgusu nasıl olmalı",
        ],
    },
    "competitor_analysis_tr": {
        "label": "Türkiye'deki Rakiplerin Fon Geçmişi",
        "queries": [
            "rehabilitasyon robotiği alanında TÜBİTAK desteği alan türk firmaları",
            "Hocoma Ekso Bionics gibi yabancı firmaların türkiye'de aldığı teşvik var mı",
            "medikal cihaz ve yazılım alanında başarılı KOSGEB projesi örnekleri",
            "türkiye'de sağlık teknolojileri alanında yatırım alan startuplar ve aldıkları hibeler",
        ],
    },
}

FON_SYNTHESIS_QUESTIONS = {
    "tubitak": [
        "Gurma'nın mevcut durumu (yeni kuruluş, AI/medikal odaklı) için en uygun 2-3 TÜBİTAK programı hangisidir?",
        "Bu programların sağladığı hibe oranı, toplam bütçe ve proje süresi nedir?",
        "Yakın zamanda açılacak veya şu an açık olan özel bir 'yapay zeka' veya 'sağlık teknolojileri' çağrısı var mı?",
    ],
    "kosgeb": [
        "Yeni kurulacak yabancı ortaklı bir şirket, KOSGEB'in hangi desteklerinden faydalanabilir?",
        "KOSGEB mi TÜBİTAK mı? Gurma'nın AR-GE projesi için hangisi daha uygun bir başlangıç noktasıdır?",
    ],
    "sanayi_bakanligi": [
        "'Yatırım Teşvik Belgesi' almanın Gurma için en somut faydaları neler olur? Süreç ne kadar karmaşık?",
        "İstanbul veya İzmir'deki Kalkınma Ajansları, Gurma'nın projesine özel bir destek sağlayabilir mi?",
    ],
    "ab_fonlari": [
        "**En Önemli Soru**: TÜBİTAK-SNSF (İsviçre) ikili işbirliği programının güncel durumu nedir? Başvuru tarihleri ve başarı oranları hakkında ne biliniyor?",
        "Gurma (İsviçre) ve Gurma (Türkiye) arasında bir EUREKA projesi kurgulamak mümkün ve mantıklı mıdır?",
    ],
    "basvuru_surec": [
        "Bir TÜBİTAK 1501 proje başvurusunun ana adımları ve zaman çizelgesi nedir?",
        "Hakemlerin projeyi değerlendirirken en çok dikkat ettiği 3 kritik nokta nedir? (Örn: İnovatif yön, ticarileşme potansiyeli)",
|
| 261 |
+
"Proje bütçesinde hangi harcamalar desteklenir, hangileri desteklenmez?",
|
| 262 |
+
],
|
| 263 |
+
"bama_joint_strategy": [
|
| 264 |
+
"Gurma ve BAMA'nın birlikte başvurabileceği en mantıklı fon hangisidir? Bu ortaklık başvuruda nasıl bir avantaj yaratır?",
|
| 265 |
+
"BAMA'nın mevcut deneyimi ve 'yerli üretici' kimliği, proje kabul şansını ne kadar artırır?",
|
| 266 |
+
"Ortak bir projede IP (fikri mülkiyet) paylaşımı nasıl düzenlenmelidir?",
|
| 267 |
+
],
|
| 268 |
+
"competitor_analysis_tr": [
|
| 269 |
+
"Türkiye'de rehabilitasyon teknolojileri alanında kimler devlet desteği alıyor? Bu projelerin odak noktaları neler?",
|
| 270 |
+
"Rakiplerin aldığı destekler, pazarın hangi yöne gittiğini ve hangi teknolojilerin önceliklendirildiğini gösteriyor mu?",
|
| 271 |
+
],
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
|
# ============================================================
# Shared Dataclasses and Base Agent
# (This section is largely unchanged from the original v1)
# ============================================================

@dataclass
class TRIntelSection:
    """A section of the Turkish research report."""
    category: str
    label: str
    queries_executed: list = field(default_factory=list)
    results: list = field(default_factory=list)
    findings: list = field(default_factory=list)
    gaps: list = field(default_factory=list)
    sources: list = field(default_factory=list)


class TRResearchAgent:
    """Base agent for Turkish-language structured web research."""

    CATEGORIES: dict = {}
    SYNTHESIS_QUESTIONS: dict = {}
    OUTPUT_SUBDIR: str = "tr"
    REPORT_TITLE: str = "Türkçe Araştırma Raporu"

    def __init__(self, search: SearchService = None):
        self.search = search or SearchService()
        self.sections: dict[str, TRIntelSection] = {}
        self.output_dir = RESEARCH_DIR / self.OUTPUT_SUBDIR
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run(self, categories: list[str] = None, delay: float = 1.0, max_results: int = 10) -> Path:
        cats_to_run = categories or list(self.CATEGORIES.keys())

        total_queries = sum(len(self.CATEGORIES[c]["queries"]) for c in cats_to_run if c in self.CATEGORIES)

        print(f"\n{'='*60}")
        print(f"{self.REPORT_TITLE}")
        print(f"Kategoriler: {len(cats_to_run)} | Sorgular: ~{total_queries}")
        print(f"{'='*60}\n")

        for cat_key in cats_to_run:
            if cat_key not in self.CATEGORIES:
                print(f"[ATLA] Bilinmeyen kategori: {cat_key}")
                continue

            cat_details = self.CATEGORIES[cat_key]
            section = TRIntelSection(category=cat_key, label=cat_details["label"])
            self._research_category(section, cat_details, delay, max_results)
            self.sections[cat_key] = section

        report_path = self._generate_report()
        self._save_data()

        print(f"\n{'='*60}")
        print(f"Rapor ve Veri Dosyaları Oluşturuldu: {self.output_dir}")
        total_findings = sum(len(s.findings) for s in self.sections.values())
        print(f"Toplam Bulgular: {total_findings}")
        print(f"{'='*60}\n")

        return report_path
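
    # Programmatic usage (a minimal sketch; assumes the SearchService backend
    # is reachable and the category keys exist in the subclass's CATEGORIES):
    #   agent = FonArastirmaAgent()
    #   report_path = agent.run(categories=["tubitak", "ab_fonlari"], delay=1.5)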

    def _research_category(self, section: TRIntelSection, cat_details: dict, delay: float, max_results: int):
        print(f"\n--- Kategori: {section.label} ---")

        for query in cat_details["queries"]:
            print(f" [ARAMA] {query}")
            try:
                results = self.search.search(query, max_results=max_results, save=False)  # Disable saving intermediate results for now
                section.queries_executed.append(query)
                section.results.extend(results)
                section.sources.extend(r.url for r in results if r.url and r.url not in section.sources)
                print(f" -> {len(results)} sonuç bulundu.")
            except Exception as e:
                print(f" -> Arama sırasında hata: {e}")

            if delay > 0:
                time.sleep(delay)

        section.findings = self._analyze_and_deduplicate(section)
        section.gaps = self._detect_gaps(section)

        confirmed_count = sum(1 for f in section.findings if f.get("confirmed"))
        print(f" [ANALİZ] {len(section.findings)} özgün bulgu ({confirmed_count} resmi kaynaklı). Gaps: {len(section.gaps)}")

    def _analyze_and_deduplicate(self, section: TRIntelSection) -> list[dict]:
        findings = []
        seen_snippets = set()

        # Guard against None urls so the sort key stays comparable
        for r in sorted(section.results, key=lambda x: x.url or ""):
            # Basic deduplication based on snippet
            snippet_key = re.sub(r'[^a-z0-9]', '', r.snippet.lower()[:100])
            if snippet_key in seen_snippets:
                continue
            seen_snippets.add(snippet_key)

            is_primary = any(domain in r.url for domain in TR_PRIMARY_SOURCES) if r.url else False

            findings.append({
                "text": f"{r.title}: {r.snippet}",
                "confirmed": is_primary,
                "source": r.url or "",
            })
        return findings[:25]  # Limit findings per section
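
    # Note on the dedup key above: the [^a-z0-9] substitution strips
    # Turkish-specific letters (ç, ğ, ı, ö, ş, ü) along with punctuation and
    # whitespace, so keys built from heavily Turkish snippets come out shorter
    # and may occasionally merge distinct snippets. Acceptable for a coarse
    # first pass over search results.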

    def _detect_gaps(self, section: TRIntelSection) -> list[dict]:
        questions = self.SYNTHESIS_QUESTIONS.get(section.category, [])
        if not questions:
            return []

        all_text = " ".join(f["text"].lower() for f in section.findings)

        gaps = []
        for q in questions:
            # Simple keyword matching to detect gaps; \w{4,} already enforces
            # a minimum keyword length of 4 characters
            keywords = re.findall(r'\w{4,}', q.lower())
            matches = sum(1 for kw in keywords if kw in all_text)
            if not all_text or matches < len(keywords) * 0.2:
                gaps.append({"text": q})
        return gaps
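
    # Worked example of the gap heuristic: for the synthesis question
    # "Proje bütçesinde hangi harcamalar desteklenir, hangileri desteklenmez?"
    # re.findall(r'\w{4,}') yields 7 keywords (proje, bütçesinde, hangi, ...),
    # so the threshold len(keywords) * 0.2 is 1.4; the question is flagged as
    # a gap unless at least 2 of those keywords appear in the combined
    # findings text.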

    def _generate_report(self) -> Path:
        # This is a lightweight data dump; main report is synthesized by Cursor
        timestamp = datetime.now().strftime("%Y-%m-%d")
        slug = self.OUTPUT_SUBDIR.replace("/", "-")
        report_path = self.output_dir / f"{slug}-rapor-{timestamp}.md"

        lines = [f"# {self.REPORT_TITLE} - Veri Dökümü", f"Tarih: {datetime.now().isoformat()}", ""]

        for cat_key, section in self.sections.items():
            lines.extend([f"## {section.label}", ""])
            lines.append("### Bulgular")
            for f in section.findings:
                tag = "✅" if f.get("confirmed") else "⚠️"
                lines.append(f"- {tag} {f.get('text', '')} ([Kaynak]({f.get('source', '#')}))")
            lines.append("\n### Cevaplanması Gereken Sentez Soruları")
            for q in self.SYNTHESIS_QUESTIONS.get(cat_key, []):
                lines.append(f"- {q}")
            lines.append("")

        report_path.write_text("\n".join(lines), encoding="utf-8")
        return report_path

    def _save_data(self):
        timestamp = datetime.now().strftime("%Y-%m-%d")
        slug = self.OUTPUT_SUBDIR.replace("/", "-")
        data_path = self.output_dir / f"{slug}-data-{timestamp}.json"

        data = {
            "agent": self.__class__.__name__,
            "timestamp": datetime.now().isoformat(),
            "sections": {
                key: {
                    "category": s.category,
                    "label": s.label,
                    "queries_executed": s.queries_executed,
                    "findings": s.findings,
                    "gaps": s.gaps,
                    "sources": s.sources,
                    "synthesis_questions": self.SYNTHESIS_QUESTIONS.get(key, []),
                }
                for key, s in self.sections.items()
            },
        }
        data_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

    def list_categories(self):
        print("Mevcut kategoriler:")
        for key, cat in self.CATEGORIES.items():
            print(f" - {key}: {cat['label']}")


# ============================================================
# Agent Implementations
# ============================================================

class MaliMusavirAgent(TRResearchAgent):
    """Researches company formation, tax, IP, and medical device regulations."""
    CATEGORIES = MALI_CATEGORIES
    SYNTHESIS_QUESTIONS = MALI_SYNTHESIS_QUESTIONS
    OUTPUT_SUBDIR = "tr-mali"
    REPORT_TITLE = "Mali Müşavir Raporu — Şirket Kuruluşu ve Mevzuat"


class FonArastirmaAgent(TRResearchAgent):
    """Researches R&D funding, focusing on bilateral Swiss-Turkish opportunities."""
    CATEGORIES = FON_CATEGORIES
    SYNTHESIS_QUESTIONS = FON_SYNTHESIS_QUESTIONS
    OUTPUT_SUBDIR = "tr-fonlar"
    REPORT_TITLE = "Fon Araştırma Raporu — TÜBİTAK ve Stratejik Destekler"


if __name__ == "__main__":
    import argparse as _ap

    parser = _ap.ArgumentParser(description="GURMA.ai Turkish Research Agents")
    parser.add_argument("agent", choices=["mali", "fonlar"], help="Agent to run")
    parser.add_argument("-c", "--categories", nargs="+", help="Specific categories to run")
    parser.add_argument("-d", "--delay", type=float, default=1.0, help="Delay between searches")
    parser.add_argument("--list-categories", action="store_true")

    args = parser.parse_args()

    agent_map = {"mali": MaliMusavirAgent, "fonlar": FonArastirmaAgent}
    agent_instance = agent_map[args.agent]()

    if args.list_categories:
        agent_instance.list_categories()
    else:
        agent_instance.run(categories=args.categories, delay=args.delay)
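
# Example invocations (a sketch; flags as defined by the argparse setup above):
#   python tr_agents.py mali --list-categories
#   python tr_agents.py fonlar -c tubitak ab_fonlari -d 2.0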
tr_tab.py
ADDED
@@ -0,0 +1,218 @@
"""
GURMA.ai — Turkey Expansion Tab

Displays research results from Mali and Fonlar agents,
plus executive summary reports.
"""

import json
import os
from datetime import datetime
from pathlib import Path

import streamlit as st

# ============================================================
# Environment & Paths
# ============================================================

IS_HF_SPACE = os.getenv("HF_SPACE") or Path("/app/research.py").exists()

if IS_HF_SPACE:
    DATA_ROOT = Path("/app/data")
    DOCS_ROOT = Path("/app/docs")
else:
    DATA_ROOT = Path(__file__).resolve().parent.parent.parent / "data"
    DOCS_ROOT = Path(__file__).resolve().parent.parent.parent / "docs"

TR_MALI_DIR = DATA_ROOT / "tr-mali"
TR_FONLAR_DIR = DATA_ROOT / "tr-fonlar"

AGENT_CONFIG = {
    "tr-mali": {
        "label": "Mali Müşavir",
        "icon": "🏛️",
        "dir": TR_MALI_DIR,
        "description": "Company formation, tax, IP, regulatory",
    },
    "tr-fonlar": {
        "label": "Fon Araştırma",
        "icon": "💰",
        "dir": TR_FONLAR_DIR,
        "description": "TÜBİTAK, KOSGEB, EU/bilateral funding",
    },
}


# ============================================================
# Data Loading
# ============================================================


@st.cache_data(ttl=120)
def _load_tr_data(agent_key: str) -> list[dict]:
    """Load all JSON data files for a TR agent, newest first."""
    agent_dir = AGENT_CONFIG[agent_key]["dir"]
    if not agent_dir.exists():
        return []

    results = []
    for f in sorted(agent_dir.glob("*.json"), reverse=True):
        try:
            # Explicit encoding: the agents write these files as UTF-8
            with open(f, encoding="utf-8") as fh:
                data = json.load(fh)
            data["_filename"] = f.name
            results.append(data)
        except Exception:
            continue
    return results
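
# Shape of each JSON file, as written by TRResearchAgent._save_data in
# tr_agents.py (illustrative excerpt, not an exhaustive schema):
#   {"agent": "MaliMusavirAgent", "timestamp": "...", "sections": {
#       "<category>": {"category", "label", "queries_executed",
#                      "findings": [{"text", "confirmed", "source"}, ...],
#                      "gaps", "sources", "synthesis_questions"}}}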


@st.cache_data(ttl=120)
def _load_exec_summaries() -> list[dict]:
    """Load executive summary markdown files, newest first."""
    if not DOCS_ROOT.exists():
        return []

    summaries = []
    for f in sorted(DOCS_ROOT.glob("exec-summary-*.md"), reverse=True):
        try:
            content = f.read_text(encoding="utf-8")
            title_line = ""
            for line in content.split("\n"):
                if line.startswith("# "):
                    title_line = line[2:].strip()
                    break
            summaries.append({
                "filename": f.name,
                "title": title_line or f.stem,
                "content": content,
                "mtime": datetime.fromtimestamp(f.stat().st_mtime),
            })
        except Exception:
            continue
    return summaries
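
# The glob above matches files such as docs/exec-summary-2026-02-18.md
# (hypothetical name; any exec-summary-*.md under DOCS_ROOT is picked up).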


# ============================================================
# Rendering Helpers
# ============================================================


def _render_finding(finding: dict):
    """Render a single research finding with source quality indicator."""
    text = finding.get("text", "")
    source = finding.get("source", "")
    confirmed = finding.get("confirmed", False)

    if not text or len(text.strip()) < 20:
        return

    tag = "✅" if confirmed else "⚠️"
    domain = ""
    if source:
        try:
            from urllib.parse import urlparse
            domain = urlparse(source).netloc
            if domain.startswith("www."):
                domain = domain[4:]
        except Exception:
            domain = source[:40]

    truncated = text[:250] + "..." if len(text) > 250 else text
    source_html = f" <a href='{source}' style='color:#888;font-size:0.75em;'>{domain}</a>" if source else ""
    st.markdown(
        f"{tag} <span style='font-size:0.88em;'>{truncated}</span>{source_html}",
        unsafe_allow_html=True,
    )


def _render_section(section: dict):
    """Render a research section (category) with findings and gaps."""
    label = section.get("label", section.get("category", "Unknown"))
    findings = section.get("findings", [])
    gaps = section.get("gaps", [])
    synthesis_qs = section.get("synthesis_questions", [])

    confirmed_count = sum(1 for f in findings if isinstance(f, dict) and f.get("confirmed"))
    total = len(findings)

    header = f"**{label}** — {total} findings"
    if confirmed_count:
        header += f" ({confirmed_count} official)"
    if gaps:
        header += f" · {len(gaps)} gaps"

    with st.expander(header, expanded=False):
        if synthesis_qs:
            st.caption("**Key questions:** " + " · ".join(synthesis_qs))
            st.markdown("")

        for f in findings[:12]:
            if isinstance(f, dict):
                _render_finding(f)

        if len(findings) > 12:
            st.caption(f"... and {len(findings) - 12} more findings")

        if gaps:
            st.markdown("---")
            for g in gaps:
                gap_text = g.get("text", g) if isinstance(g, dict) else g
                st.caption(f"🔍 **Gap:** {gap_text}")


def _render_agent_data(agent_key: str, data_files: list[dict]):
    """Render all data for one TR agent."""
    if not data_files:
        st.info(f"No data files found in `data/{agent_key}/`. Run the agent first.")
        return

    latest = data_files[0]
    timestamp = latest.get("timestamp", "")[:16].replace("T", " ")
    sections = latest.get("sections", {})

    st.caption(f"Latest run: {timestamp} · {len(sections)} categories · File: `{latest.get('_filename', '')}`")

    for section_data in sections.values():
        _render_section(section_data)


# ============================================================
# Main Entry Point
# ============================================================


def render_tr_tab():
    """Main entry point — called from app.py."""
    st.title("Turkey Expansion")
    st.caption("Company formation research & funding intelligence for Gurma Turkey")

    # --- Executive Summaries ---
    summaries = _load_exec_summaries()
    if summaries:
        st.header("Executive Summaries")
        for s in summaries:
            age = datetime.now() - s["mtime"]
            age_label = "today" if age.days == 0 else f"{age.days}d ago"
            with st.expander(f"📋 {s['title']} ({age_label})", expanded=len(summaries) <= 2):
                st.markdown(s["content"])

        st.divider()

    # --- Agent Research Data ---
    st.header("Research Data")

    agent_tabs = st.tabs([
        f"{cfg['icon']} {cfg['label']}" for cfg in AGENT_CONFIG.values()
    ])

    for tab, agent_key in zip(agent_tabs, AGENT_CONFIG.keys()):
        with tab:
            config = AGENT_CONFIG[agent_key]
            st.caption(config["description"])
            data_files = _load_tr_data(agent_key)
            _render_agent_data(agent_key, data_files)
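
# Wiring sketch (an assumption, not the deployed code): the docstring above
# says this function is called from app.py, e.g. via a nav entry such as:
#   from tr_tab import render_tr_tab
#   if page == "Turkey Expansion":
#       render_tr_tab()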