Emre Sarigöl committed
Commit d3a246e · 1 Parent(s): a02dc64

Deploy GURMA.ai Dashboard - 2026-02-18 14:15

Files changed (12)
  1. Dockerfile +1 -1
  2. app.py +267 -256
  3. cli.py +340 -0
  4. config.py +99 -0
  5. extract.py +537 -0
  6. intel.py +508 -0
  7. llm.py +154 -0
  8. research.py +61 -1913
  9. search.py +305 -0
  10. sota_agent.py +850 -0
  11. tr_agents.py +480 -0
  12. tr_tab.py +218 -0
Dockerfile CHANGED
@@ -23,7 +23,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
 # Create data directories
-RUN mkdir -p data/intel
+RUN mkdir -p data/intel data/tr-mali data/tr-fonlar docs
 
 # Expose Streamlit port (HF Spaces expects app_port from README.md)
 EXPOSE 8501
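The broadened `mkdir -p` pre-creates every directory the new modules write into: `data/intel` for Deep Intel reports, `data/tr-mali` and `data/tr-fonlar` for the Turkish research agents in tr_agents.py, and `docs`. A minimal sketch of resolving those paths at runtime — the directory names come from the mkdir call above; the constant names and usage are assumptions, not code from this commit:

from pathlib import Path

DATA_ROOT = Path("/app/data")            # /app is the project root on HF Spaces (see config.py)
TR_MALI_DIR = DATA_ROOT / "tr-mali"      # assumed home of MaliMusavirAgent reports
TR_FONLAR_DIR = DATA_ROOT / "tr-fonlar"  # assumed home of FonArastirmaAgent reports

for d in (TR_MALI_DIR, TR_FONLAR_DIR):
    d.mkdir(parents=True, exist_ok=True)  # idempotent, mirrors RUN mkdir -p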
app.py CHANGED
@@ -25,7 +25,7 @@ import pandas as pd
 IS_HF_SPACE = os.getenv("HF_SPACE") or Path("/app/research.py").exists()
 
 if IS_HF_SPACE:
-    # HF Space: import from same directory
+    # HF Space: research.py shim re-exports everything
    from research import (
        SearchService,
        CompetitorExtractor,
@@ -41,9 +41,9 @@ if IS_HF_SPACE:
        LLM_ENABLED,
    )
 else:
-    # Local: add src to path and import from utils
+    # Local: import via package __init__
    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-    from src.utils.research import (
+    from src.utils import (
        SearchService,
        CompetitorExtractor,
        CompetitorIntelAgent,
@@ -832,225 +832,18 @@ def export_html(data: dict, research: list, date_range: str = "All time") -> str
 
 
 # ============================================================
-# Main Application
+# Competitive Intel Page
 # ============================================================
 
-def main():
-    # Check access
-    if not check_access():
-        show_login_page()
-        return
-
-    # On HF Space, optionally hydrate runtime data from a private dataset repo.
-    sync_status = sync_private_data_if_configured()
-    if sync_status.get("status") == "error":
-        st.error(f"Private data sync failed: {sync_status.get('reason', 'unknown error')}")
-        return
-
-    # --- Page Navigation ---
-    page = st.sidebar.radio(
-        "Navigation",
-        ["Competitive Intel", "Model Evaluation"],
-        index=0,
-        key="nav_page",
-    )
-
-    if page == "Model Evaluation":
-        if IS_HF_SPACE:
-            from eval_tab import render_eval_tab
-        else:
-            from src.dashboard.eval_tab import render_eval_tab
-        render_eval_tab()
-        return
-
-    data = load_data()
-    research = load_research_files()
-
-    # --- Sidebar ---
-    with st.sidebar:
-        # === RESEARCH ===
-        st.header("Research")
-
-        queries_text = st.text_area(
-            "Queries",
-            value=DEFAULT_QUERIES,
-            height=150,
-            help="Enter search queries, one per line."
-        )
-        queries = [q.strip() for q in queries_text.strip().split("\n") if q.strip()]
-
-        # AI analysis option (only if LLM enabled)
-        analyze_with_ai = False
-        if LLM_ENABLED:
-            analyze_with_ai = st.checkbox("Analyze with AI", value=True, help="Use LLM to extract strategic insights from results")
-
-        if st.button(f"Run {len(queries)} searches", width="stretch", type="primary"):
-            progress = st.progress(0, text="Starting...")
-            success, total, failed, insights = run_expand_research(
-                queries,
-                progress_callback=lambda p, t: progress.progress(p, text=t),
-                analyze_with_ai=analyze_with_ai
-            )
-            progress.empty()
-
-            if success > 0:
-                msg = f"{success}/{total} searches done"
-                if insights:
-                    msg += f" + {len(insights)} AI insights"
-                    # Store insights in session state for display
-                    st.session_state.last_insights = insights
-                st.success(msg)
-                st.cache_data.clear()
-                st.rerun()
-
-        # Show last AI insights if any
-        if st.session_state.get("last_insights"):
-            with st.expander("AI Insights", expanded=True):
-                for insight in st.session_state.last_insights:
-                    st.caption(f"• {insight}")
-                if st.button("Clear", key="clear_insights"):
-                    del st.session_state.last_insights
-                    st.rerun()
-
-        st.divider()
-
-        # === DEEP INTEL ===
-        st.header("Deep Intel")
-        intel_company = st.selectbox("Competitor", COMPETITORS, index=0)
-        intel_categories = st.multiselect(
-            "Categories",
-            options=list(DEEP_INTEL_CATEGORIES.keys()),
-            default=list(DEEP_INTEL_CATEGORIES.keys()),
-            format_func=lambda k: DEEP_INTEL_CATEGORIES[k]["label"],
-        )
-        btn_col1, btn_col2 = st.columns([3, 1])
-        run_clicked = btn_col1.button("Run Deep Intel", width="stretch")
-        stop_clicked = btn_col2.button("Stop", key="stop_intel", width="stretch")
-
-        if stop_clicked:
-            st.session_state["intel_stop"] = True
-
-        if run_clicked:
-            st.session_state["intel_stop"] = False
-            agent = CompetitorIntelAgent(intel_company)
-            total_queries = sum(
-                len(DEEP_INTEL_CATEGORIES[c]["queries"])
-                for c in intel_categories if c in DEEP_INTEL_CATEGORIES
-            )
-            progress = st.progress(0, text=f"Starting {intel_company}...")
-            completed = [0]
-
-            original_search = agent.search.search
-            def _tracked_search(query, max_results=10, save=True):
-                if st.session_state.get("intel_stop"):
-                    return []
-                completed[0] += 1
-                progress.progress(
-                    min(completed[0] / max(total_queries, 1), 0.95),
-                    text=f"[{completed[0]}/{total_queries}] {query[:40]}...",
-                )
-                return original_search(query, max_results=max_results, save=save)
-            agent.search.search = _tracked_search
-
-            report_path = agent.run(
-                categories=intel_categories or None,
-                delay=1.0,
-            )
-            progress.progress(1.0, text="Done!")
-            progress.empty()
-
-            stopped = st.session_state.get("intel_stop", False)
-            findings = sum(len(s.findings) for s in agent.sections.values())
-            gaps = sum(len(s.gaps) for s in agent.sections.values())
-            if stopped:
-                st.warning(f"Stopped early — {intel_company}: {findings} findings, {gaps} gaps (partial)")
-            else:
-                st.success(f"{intel_company}: {findings} findings, {gaps} gaps")
-            st.session_state["intel_stop"] = False
-            st.cache_data.clear()
-            st.rerun()
-
-        st.divider()
-
-        # === DATA ===
-        st.header("Data")
-
-        date_range = st.selectbox(
-            "Time range",
-            ["All time", "Last 7 days", "Last 30 days", "Last 90 days"],
-            index=0,
-            label_visibility="collapsed"
-        )
-
-        col1, col2 = st.columns(2)
-        if col1.button("Refresh", width="stretch", help="Re-extract from research files"):
-            with st.spinner("..."):
-                run_extract()
-            st.cache_data.clear()
-            st.rerun()
-
-        if data:
-            report = export_html(data, research, date_range)
-            col2.download_button(
-                "Export",
-                report,
-                file_name=f"report-{datetime.now().strftime('%Y%m%d')}.html",
-                mime="text/html",
-                width="stretch"
-            )
-
-        st.divider()
-
-        # === STATUS ===
-        st.caption(f"{len(research)} files · Updated {data.get('_generated', 'N/A')[:10] if data else 'never'}")
-
-        if ACCESS_KEY and st.session_state.get("authenticated"):
-            if st.button("Logout", width="stretch"):
-                st.session_state.authenticated = False
-                st.session_state.admin_authenticated = False
-                st.query_params.pop("auth", None)
-                st.query_params.pop("adm", None)
-                st.rerun()
-
-        # === ADMIN: Access Log ===
-        if ADMIN_KEY:
-            # Auto-authenticate from URL token
-            if not st.session_state.get("admin_authenticated"):
-                if st.query_params.get("adm") == _auth_token(ADMIN_KEY, salt="gurma_adm"):
-                    st.session_state.admin_authenticated = True
-
-            st.divider()
-            if st.session_state.get("admin_authenticated"):
-                access_log = load_access_log()
-                st.caption(f"Access log ({len(access_log)} entries)")
-                if access_log:
-                    for entry in reversed(access_log[-20:]):
-                        st.caption(f"{entry.get('timestamp', '?')} · {entry.get('ip', '?')}")
-                else:
-                    st.caption("No accesses recorded yet")
-            else:
-                with st.popover("Admin"):
-                    admin_input = st.text_input("Admin key", type="password", key="admin_key_input")
-                    if st.button("Unlock", key="admin_unlock"):
-                        if admin_input == ADMIN_KEY:
-                            st.session_state.admin_authenticated = True
-                            st.query_params["adm"] = _auth_token(ADMIN_KEY, salt="gurma_adm")
-                            st.rerun()
-                        else:
-                            st.error("Invalid")
-
-    # --- Log access ---
-    log_access()
-
-    # --- Main Content ---
-    st.title("Rehabilitation Robotics — Competitive Landscape")
-
+
+def _render_intel_page(data, research, date_range):
+    """Competitive intel main content — rendered inside its tab."""
    if not data:
        st.warning("No competitor data found.")
        st.markdown("**First time?** Run the research pipeline to get started:")
-
+
        col_init1, col_init2 = st.columns(2)
-
+
        if col_init1.button("Quick Start (10 searches)", type="primary", width="stretch"):
            with st.spinner("Running core competitor searches..."):
                core_queries = [
@@ -1071,15 +864,15 @@ def main():
                    progress.progress((i + 1) / (len(core_queries) + 1), f"Searching: {q[:30]}...")
                    if run_search(q):
                        success += 1
-
+
                progress.progress(1.0, "Extracting data...")
                run_extract()
                progress.empty()
-
+
                st.success(f"Done! {success}/{len(core_queries)} searches completed.")
                st.cache_data.clear()
                st.rerun()
-
+
        if col_init2.button("Full Research (47 searches)", width="stretch"):
            with st.spinner("Running full competitor research..."):
                queries = []
@@ -1087,45 +880,45 @@ def main():
                    for template in BATCH_QUERY_TEMPLATES:
                        queries.append(template.format(company=company))
                queries.extend(MARKET_QUERIES)
-
+
                progress = st.progress(0, "Starting...")
                success = 0
                for i, q in enumerate(queries):
                    progress.progress((i + 1) / (len(queries) + 1), f"[{i+1}/{len(queries)}] {q[:30]}...")
                    if run_search(q):
                        success += 1
-
+
                progress.progress(1.0, "Extracting data...")
                run_extract()
                progress.empty()
-
+
                st.success(f"Done! {success}/{len(queries)} searches completed.")
                st.cache_data.clear()
                st.rerun()
-
+
        return
-
+
    competitors = data.get("competitors", [])
    market = data.get("market", {})
-
+
    # ===== MARKET & OPPORTUNITY =====
    col_market, col_opp = st.columns([1, 1])
-
+
    with col_market:
        st.markdown("### Market")
-
+
        size_2024 = market.get('size_2024', 2e9)
        size_2029 = market.get('size_2029_ai', 9.1e9)
        cagr = market.get('cagr', 0.278)
-
+
        m1, m2, m3 = st.columns(3)
        m1.metric("2024 Market", f"${size_2024/1e9:.1f}B")
        m2.metric("2029 AI Segment", f"${size_2029/1e9:.1f}B")
        m3.metric("CAGR", f"{cagr*100:.1f}%")
-
+
        growth_pct = min((size_2029 / size_2024 - 1) * 100, 400)
        st.progress(growth_pct / 400, text=f"{growth_pct:.0f}% projected growth (2024→2029)")
-
+
    with col_opp:
        opportunity = data.get("opportunity", {})
        headline = opportunity.get("headline", "Market opportunity detected")
@@ -1133,18 +926,16 @@ def main():
        confirmed = opportunity.get("confirmed", False)
        update_available = opportunity.get("update_available", False)
        detected_at = opportunity.get("detected_at", "")
-
+
        if confirmed:
            badge = f"<span style='color: #2ecc71;'>● Confirmed {opportunity.get('confirmed_at', detected_at)}</span>"
        elif update_available:
            badge = "<span style='color: #e67e22;'>● Update available</span>"
        else:
            badge = f"<span style='color: #3498db;'>● Auto-detected {detected_at}</span>"
-
-        # Source indicators
+
        sources = opportunity.get("sources", [])
        if not sources:
-            # Backward compat with old intel_sourced/llm_synthesized booleans
            if opportunity.get("intel_sourced"):
                sources.append("intel")
            if opportunity.get("llm_synthesized"):
@@ -1155,9 +946,9 @@ def main():
            for s in sources if s in badge_labels
        ]
        source_html = " ".join(source_tags)
-
+
        points_html = "".join(f"<li>{p}</li>" for p in points[:4])
-
+
        st.markdown(f"""
        <div style="
            background: linear-gradient(135deg, #1a472a 0%, #2d5a3c 100%);
@@ -1174,7 +965,7 @@ def main():
        </ul>
        </div>
        """, unsafe_allow_html=True)
-
+
        opp_col1, opp_col2 = st.columns(2)
        if not confirmed:
            if opp_col1.button("Confirm", key="confirm_opp", width="stretch"):
@@ -1186,19 +977,19 @@ def main():
                apply_opportunity_update()
                st.cache_data.clear()
                st.rerun()
-
+
    st.divider()
-
+
    # ===== COMPETITOR CARDS =====
    st.header("Competitors")
-
+
    sorted_competitors = sorted(competitors, key=lambda x: x.get("mentions", 0), reverse=True)
-
+
    legend_items = [f"<span style='color: {v['color']};'>●</span> {v['label']}" for k, v in STATUS_CONFIG.items() if k != "unknown"]
    st.markdown(" &nbsp;|&nbsp; ".join(legend_items), unsafe_allow_html=True)
-
+
    col1, col2 = st.columns(2)
-
+
    for i, comp in enumerate(sorted_competitors):
        with col1 if i % 2 == 0 else col2:
            status = comp.get("status", "unknown")
@@ -1206,7 +997,7 @@ def main():
            color = status_color(status)
            label = status_label(status)
            mentions = comp.get("mentions", 0)
-
+
            st.markdown(f"""
            <div style="
                border: 1px solid {color}40;
@@ -1232,37 +1023,37 @@ def main():
            </div>
            </div>
            """, unsafe_allow_html=True)
-
+
            with st.expander("Details", expanded=False):
                m1, m2 = st.columns(2)
                if comp.get("stock"):
                    m1.metric("Stock", f"${comp['stock']:.2f}")
                if comp.get("funding"):
                    m2.metric("Funding", f"${comp['funding']/1e6:.0f}M")
-
+
                if comp.get("notes"):
                    st.caption(comp["notes"][:200] + "..." if len(comp.get("notes", "")) > 200 else comp.get("notes", ""))
-
+
                events = comp.get("events", [])[:3]
                if events:
                    st.markdown("**Recent:**")
                    for e in events:
                        st.caption(f"• {e.get('date', 'N/A')}: {e.get('event', '')[:80]}...")
-
+
                urls = comp.get("sample_urls", [])[:2]
                if urls:
                    for url in urls:
                        st.markdown(f"[Source →]({url})")
-
+
    # --- Timeline ---
    st.header("Timeline")
-
+
    timeline_fig = build_timeline_figure(sorted_competitors, date_range)
    if timeline_fig:
        st.plotly_chart(timeline_fig, width="stretch")
    else:
        st.info("No events in selected time range")
-
+
    # --- Deep Intel ---
    intel_reports = load_intel_reports()
    if intel_reports:
@@ -1320,7 +1111,7 @@ def main():
    with st.expander("Recent News", expanded=False):
        news_by_company = {c["name"]: [] for c in competitors}
        news_by_company["Other"] = []
-
+
        for r in research:
            timestamp = r.get("timestamp", "")
            for result in r.get("results", []):
@@ -1330,7 +1121,7 @@ def main():
                    "url": result.get("url", ""),
                    "date": timestamp[:10] if timestamp else ""
                }
-
+
                text = (item["title"] + " " + item["snippet"]).lower()
                found = False
                for comp in competitors:
@@ -1341,13 +1132,13 @@ def main():
                        break
                if not found:
                    news_by_company["Other"].append(item)
-
+
        company_options = ["All"] + [c["name"] for c in competitors if news_by_company.get(c["name"])]
        company_filter = st.selectbox("Filter by company", company_options, index=0)
-
+
        displayed = 0
        max_display = 12
-
+
        if company_filter == "All":
            active_companies = [c for c in competitors if news_by_company.get(c["name"])]
            per_company = max(2, max_display // len(active_companies)) if active_companies else 0
@@ -1363,10 +1154,230 @@ def main():
            for item in items[:max_display]:
                _render_news_item(item, company_filter)
                displayed += 1
-
+
        if displayed == 0:
            st.info("No news found. Run some searches!")
 
 
+# ============================================================
+# Main Application
+# ============================================================
+
+def main():
+    # Check access
+    if not check_access():
+        show_login_page()
+        return
+
+    # On HF Space, optionally hydrate runtime data from a private dataset repo.
+    sync_status = sync_private_data_if_configured()
+    if sync_status.get("status") == "error":
+        st.error(f"Private data sync failed: {sync_status.get('reason', 'unknown error')}")
+        return
+
+    data = load_data()
+    research = load_research_files()
+
+    # --- Sidebar (Competitive Intel controls) ---
+    with st.sidebar:
+        # === RESEARCH ===
+        st.header("Research")
+
+        queries_text = st.text_area(
+            "Queries",
+            value=DEFAULT_QUERIES,
+            height=150,
+            help="Enter search queries, one per line."
+        )
+        queries = [q.strip() for q in queries_text.strip().split("\n") if q.strip()]
+
+        # AI analysis option (only if LLM enabled)
+        analyze_with_ai = False
+        if LLM_ENABLED:
+            analyze_with_ai = st.checkbox("Analyze with AI", value=True, help="Use LLM to extract strategic insights from results")
+
+        if st.button(f"Run {len(queries)} searches", width="stretch", type="primary"):
+            progress = st.progress(0, text="Starting...")
+            success, total, failed, insights = run_expand_research(
+                queries,
+                progress_callback=lambda p, t: progress.progress(p, text=t),
+                analyze_with_ai=analyze_with_ai
+            )
+            progress.empty()
+
+            if success > 0:
+                msg = f"{success}/{total} searches done"
+                if insights:
+                    msg += f" + {len(insights)} AI insights"
+                    # Store insights in session state for display
+                    st.session_state.last_insights = insights
+                st.success(msg)
+                st.cache_data.clear()
+                st.rerun()
+
+        # Show last AI insights if any
+        if st.session_state.get("last_insights"):
+            with st.expander("AI Insights", expanded=True):
+                for insight in st.session_state.last_insights:
+                    st.caption(f"• {insight}")
+                if st.button("Clear", key="clear_insights"):
+                    del st.session_state.last_insights
+                    st.rerun()
+
+        st.divider()
+
+        # === DEEP INTEL ===
+        st.header("Deep Intel")
+        intel_company = st.selectbox("Competitor", COMPETITORS, index=0)
+        intel_categories = st.multiselect(
+            "Categories",
+            options=list(DEEP_INTEL_CATEGORIES.keys()),
+            default=list(DEEP_INTEL_CATEGORIES.keys()),
+            format_func=lambda k: DEEP_INTEL_CATEGORIES[k]["label"],
+        )
+        btn_col1, btn_col2 = st.columns([3, 1])
+        run_clicked = btn_col1.button("Run Deep Intel", width="stretch")
+        stop_clicked = btn_col2.button("Stop", key="stop_intel", width="stretch")
+
+        if stop_clicked:
+            st.session_state["intel_stop"] = True
+
+        if run_clicked:
+            st.session_state["intel_stop"] = False
+            agent = CompetitorIntelAgent(intel_company)
+            total_queries = sum(
+                len(DEEP_INTEL_CATEGORIES[c]["queries"])
+                for c in intel_categories if c in DEEP_INTEL_CATEGORIES
+            )
+            progress = st.progress(0, text=f"Starting {intel_company}...")
+            completed = [0]
+
+            original_search = agent.search.search
+            def _tracked_search(query, max_results=10, save=True):
+                if st.session_state.get("intel_stop"):
+                    return []
+                completed[0] += 1
+                progress.progress(
+                    min(completed[0] / max(total_queries, 1), 0.95),
+                    text=f"[{completed[0]}/{total_queries}] {query[:40]}...",
+                )
+                return original_search(query, max_results=max_results, save=save)
+            agent.search.search = _tracked_search
+
+            report_path = agent.run(
+                categories=intel_categories or None,
+                delay=1.0,
+            )
+            progress.progress(1.0, text="Done!")
+            progress.empty()
+
+            stopped = st.session_state.get("intel_stop", False)
+            findings = sum(len(s.findings) for s in agent.sections.values())
+            gaps = sum(len(s.gaps) for s in agent.sections.values())
+            if stopped:
+                st.warning(f"Stopped early — {intel_company}: {findings} findings, {gaps} gaps (partial)")
+            else:
+                st.success(f"{intel_company}: {findings} findings, {gaps} gaps")
+            st.session_state["intel_stop"] = False
+            st.cache_data.clear()
+            st.rerun()
+
+        st.divider()
+
+        # === DATA ===
+        st.header("Data")
+
+        date_range = st.selectbox(
+            "Time range",
+            ["All time", "Last 7 days", "Last 30 days", "Last 90 days"],
+            index=0,
+            label_visibility="collapsed"
+        )
+
+        col1, col2 = st.columns(2)
+        if col1.button("Refresh", width="stretch", help="Re-extract from research files"):
+            with st.spinner("..."):
+                run_extract()
+            st.cache_data.clear()
+            st.rerun()
+
+        if data:
+            report = export_html(data, research, date_range)
+            col2.download_button(
+                "Export",
+                report,
+                file_name=f"report-{datetime.now().strftime('%Y%m%d')}.html",
+                mime="text/html",
+                width="stretch"
+            )
+
+        st.divider()
+
+        # === STATUS ===
+        st.caption(f"{len(research)} files · Updated {data.get('_generated', 'N/A')[:10] if data else 'never'}")
+
+        if ACCESS_KEY and st.session_state.get("authenticated"):
+            if st.button("Logout", width="stretch"):
+                st.session_state.authenticated = False
+                st.session_state.admin_authenticated = False
+                st.query_params.pop("auth", None)
+                st.query_params.pop("adm", None)
+                st.rerun()
+
+        # === ADMIN: Access Log ===
+        if ADMIN_KEY:
+            # Auto-authenticate from URL token
+            if not st.session_state.get("admin_authenticated"):
+                if st.query_params.get("adm") == _auth_token(ADMIN_KEY, salt="gurma_adm"):
+                    st.session_state.admin_authenticated = True
+
+            st.divider()
+            if st.session_state.get("admin_authenticated"):
+                access_log = load_access_log()
+                st.caption(f"Access log ({len(access_log)} entries)")
+                if access_log:
+                    for entry in reversed(access_log[-20:]):
+                        st.caption(f"{entry.get('timestamp', '?')} · {entry.get('ip', '?')}")
+                else:
+                    st.caption("No accesses recorded yet")
+            else:
+                with st.popover("Admin"):
+                    admin_input = st.text_input("Admin key", type="password", key="admin_key_input")
+                    if st.button("Unlock", key="admin_unlock"):
+                        if admin_input == ADMIN_KEY:
+                            st.session_state.admin_authenticated = True
+                            st.query_params["adm"] = _auth_token(ADMIN_KEY, salt="gurma_adm")
+                            st.rerun()
+                        else:
+                            st.error("Invalid")
+
+    # --- Log access ---
+    log_access()
+
+    # --- Main Content (Tabs) ---
+    tab_intel, tab_eval, tab_tr = st.tabs([
+        "Competitive Intel",
+        "Model Evaluation",
+        "Turkey Expansion",
+    ])
+
+    with tab_intel:
+        _render_intel_page(data, research, date_range)
+
+    with tab_eval:
+        if IS_HF_SPACE:
+            from eval_tab import render_eval_tab
+        else:
+            from src.dashboard.eval_tab import render_eval_tab
+        render_eval_tab()
+
+    with tab_tr:
+        if IS_HF_SPACE:
+            from tr_tab import render_tr_tab
+        else:
+            from src.dashboard.tr_tab import render_tr_tab
+        render_tr_tab()
+
+
 if __name__ == "__main__":
     main()
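The core of this refactor: the old `main()` mixed navigation (a sidebar radio) with the intel page body, while the new version factors the body into `_render_intel_page()` and switches to `st.tabs`, importing each tab's renderer inside its `with` block so the HF-Space vs. local module path is resolved at run time. A stripped-down sketch of the same pattern (toy stand-ins, not the dashboard code itself):

import streamlit as st

IS_HF_SPACE = False  # stand-in for the detection at the top of app.py

def _render_intel_page() -> None:
    st.header("Competitive Intel")  # stand-in for the real page body

tab_intel, tab_eval = st.tabs(["Competitive Intel", "Model Evaluation"])

with tab_intel:
    _render_intel_page()

with tab_eval:
    if IS_HF_SPACE:
        from eval_tab import render_eval_tab                # flat layout on HF Spaces
    else:
        from src.dashboard.eval_tab import render_eval_tab  # package layout locally
    render_eval_tab()

Note that Streamlit executes every tab body on each rerun; tabs control visibility, not execution, so the per-tab imports run regardless of which tab is selected.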
cli.py ADDED
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+GURMA.ai Research Tool — CLI entry point.
+
+Usage:
+    python research.py search "rehabilitation robotics market"
+    python research.py batch
+    python research.py competitor "Ekso Bionics"
+    python research.py competitor --list-categories
+    python research.py extract
+    python research.py list
+    python research.py sota
+    python research.py sota --analyze notes/research/podcast.md
+    python research.py mali
+    python research.py fonlar -c tubitak
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+try:
+    from .config import RESEARCH_DIR, COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES, LLM_ENABLED
+    from .search import SearchService, ResultStorage
+    from .extract import CompetitorExtractor
+    from .intel import CompetitorIntelAgent, DEEP_INTEL_CATEGORIES
+except ImportError:
+    from config import RESEARCH_DIR, COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES, LLM_ENABLED
+    from search import SearchService, ResultStorage
+    from extract import CompetitorExtractor
+    from intel import CompetitorIntelAgent, DEEP_INTEL_CATEGORIES
+
+
+# ============================================================
+# Commands
+# ============================================================
+
+def cmd_search(args):
+    service = SearchService(backend=args.backend)
+    print(f"Searching: {args.query}")
+    print(f"Backend: {args.backend} | Max: {args.max_results}")
+    print("-" * 50)
+
+    results = service.search(args.query, args.max_results, save=args.save)
+
+    for i, r in enumerate(results, 1):
+        print(f"\n{i}. {r.title}")
+        print(f"   {r.url}")
+        print(f"   {r.snippet[:150]}...")
+
+    print(f"\n[{len(results)} results]")
+    if args.save:
+        print(f"Saved to: {RESEARCH_DIR}")
+
+
+def cmd_batch(args):
+    service = SearchService(backend=args.backend)
+    storage = ResultStorage()
+
+    queries = []
+    for company in COMPETITORS:
+        for template in BATCH_QUERY_TEMPLATES:
+            queries.append(template.format(company=company))
+    queries.extend(MARKET_QUERIES)
+
+    total_queries = len(queries)
+
+    skipped = 0
+    if not args.force:
+        recent = storage.get_recent_queries(days=args.days)
+        original_count = len(queries)
+        queries = [q for q in queries if q.lower().strip() not in recent]
+        skipped = original_count - len(queries)
+
+    print(f"Batch Research")
+    print(f"{'='*60}")
+    print(f"Competitors: {len(COMPETITORS)}")
+    print(f"Total queries: {total_queries}")
+    if skipped > 0:
+        print(f"Skipped (run in last {args.days} days): {skipped}")
+    print(f"New queries to run: {len(queries)}")
+    print(f"Output: {RESEARCH_DIR}")
+    print(f"{'='*60}")
+
+    if not queries:
+        print("\nNo new queries to run. Use --force to re-run all.")
+        return
+
+    def progress(i, total, query):
+        print(f"\n[{i}/{total}] {query}")
+
+    stats = service.search_batch(queries, args.max_results, args.delay, callback=progress)
+
+    success = sum(1 for v in stats.values() if v >= 0)
+    print(f"\n{'='*60}")
+    print(f"Complete: {success}/{len(queries)} successful")
+    if skipped > 0:
+        print(f"Skipped: {skipped} (already run recently)")
+    print(f"{'='*60}")
+
+
+def cmd_competitor(args):
+    company = args.company
+    use_external_llm = args.external_llm
+
+    if use_external_llm and not LLM_ENABLED:
+        print("Warning: --external-llm requested but OPENROUTER_API_KEY not found. Skipping external LLM.")
+        use_external_llm = False
+
+    categories = None
+    if args.categories:
+        categories = [c.strip() for c in args.categories.split(",")]
+        valid = set(DEEP_INTEL_CATEGORIES.keys())
+        invalid = [c for c in categories if c not in valid]
+        if invalid:
+            print(f"Invalid categories: {invalid}")
+            print(f"Valid: {sorted(valid)}")
+            return
+
+    if args.list_categories:
+        print("Available categories:")
+        for key, cat in DEEP_INTEL_CATEGORIES.items():
+            q_count = len(cat["queries"])
+            print(f"  {key:30s} {cat['label']:30s} ({q_count} queries)")
+        return
+
+    agent = CompetitorIntelAgent(company)
+    report_path = agent.run(
+        categories=categories,
+        use_external_llm=use_external_llm,
+        delay=args.delay,
+        max_results=args.max_results,
+    )
+
+    print(f"\nReport: {report_path}")
+
+
+def cmd_extract(args):
+    extractor = CompetitorExtractor()
+
+    print(f"Loading research from: {extractor.research_dir}")
+    data = extractor.process()
+
+    if not data["competitors"]:
+        print("No research files found. Run 'batch' first.")
+        return
+
+    output = extractor.save(data)
+
+    print(f"Saved to: {output}")
+    print(f"\nCompany mentions:")
+    for comp in data["competitors"]:
+        status_marker = {"collapsed": "⚠", "weak": "↓", "growing": "↑", "strong": "★"}.get(comp["status"], "•")
+        print(f"  {status_marker} {comp['name']}: {comp['mentions']} mentions ({comp['status']})")
+
+
+def cmd_sota(args):
+    try:
+        from .sota_agent import SOTAScoutAgent
+    except ImportError:
+        from sota_agent import SOTAScoutAgent
+
+    agent = SOTAScoutAgent()
+
+    if args.analyze:
+        report = agent.analyze(args.analyze)
+        print(f"\nAnalysis report: {report}")
+        return
+
+    agent.show(section=args.show)
+
+
+def cmd_mali(args):
+    try:
+        from .tr_agents import MaliMusavirAgent
+    except ImportError:
+        from tr_agents import MaliMusavirAgent
+
+    agent = MaliMusavirAgent()
+
+    if args.list_categories:
+        agent.list_categories()
+        return
+
+    categories = None
+    if args.categories:
+        categories = [c.strip() for c in args.categories.split(",")]
+        valid = set(agent.CATEGORIES.keys())
+        invalid = [c for c in categories if c not in valid]
+        if invalid:
+            print(f"Geçersiz kategoriler: {invalid}")
+            print(f"Geçerli: {sorted(valid)}")
+            return
+
+    report_path = agent.run(
+        categories=categories,
+        delay=args.delay,
+        max_results=args.max_results,
+    )
+    print(f"\nRapor: {report_path}")
+
+
+def cmd_fonlar(args):
+    try:
+        from .tr_agents import FonArastirmaAgent
+    except ImportError:
+        from tr_agents import FonArastirmaAgent
+
+    agent = FonArastirmaAgent()
+
+    if args.list_categories:
+        agent.list_categories()
+        return
+
+    categories = None
+    if args.categories:
+        categories = [c.strip() for c in args.categories.split(",")]
+        valid = set(agent.CATEGORIES.keys())
+        invalid = [c for c in categories if c not in valid]
+        if invalid:
+            print(f"Geçersiz kategoriler: {invalid}")
+            print(f"Geçerli: {sorted(valid)}")
+            return
+
+    report_path = agent.run(
+        categories=categories,
+        delay=args.delay,
+        max_results=args.max_results,
+    )
+    print(f"\nRapor: {report_path}")
+
+
+def cmd_list(args):
+    storage = ResultStorage()
+    searches = storage.list_searches(args.limit)
+
+    if not searches:
+        print(f"No searches in {RESEARCH_DIR}")
+        return
+
+    print(f"Recent searches ({RESEARCH_DIR}):\n")
+    for s in searches:
+        print(f"  {s['timestamp'][:10]}  {s['results']:2d} results  {s['query'][:50]}")
+
+
+# ============================================================
+# Argparse
+# ============================================================
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="GURMA.ai Research Tool",
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    subparsers = parser.add_subparsers(dest="command", help="Commands")
+
+    # search
+    p_search = subparsers.add_parser("search", help="Single web search")
+    p_search.add_argument("query", help="Search query")
+    p_search.add_argument("-b", "--backend", default="duckduckgo",
+                          choices=["duckduckgo", "ddg", "serpapi", "brave"])
+    p_search.add_argument("-n", "--max-results", type=int, default=10)
+    p_search.add_argument("--no-save", dest="save", action="store_false")
+    p_search.set_defaults(func=cmd_search)
+
+    # batch
+    p_batch = subparsers.add_parser("batch", help="Batch research all competitors")
+    p_batch.add_argument("-b", "--backend", default="duckduckgo")
+    p_batch.add_argument("-n", "--max-results", type=int, default=10)
+    p_batch.add_argument("-d", "--delay", type=float, default=0.5)
+    p_batch.add_argument("--days", type=int, default=7,
+                         help="Skip queries run within N days (default: 7)")
+    p_batch.add_argument("-f", "--force", action="store_true",
+                         help="Force re-run all queries (ignore deduplication)")
+    p_batch.set_defaults(func=cmd_batch)
+
+    # competitor (deep intel)
+    p_comp = subparsers.add_parser("competitor", help="Deep competitive intelligence on a company")
+    p_comp.add_argument("company", nargs="?", default="", help="Company name (e.g. 'Ekso Bionics')")
+    p_comp.add_argument("--external-llm", action="store_true",
+                        help="Also use external LLM (OpenRouter) for enhanced analysis")
+    p_comp.add_argument("-c", "--categories", type=str, default=None,
+                        help="Comma-separated categories (default: all)")
+    p_comp.add_argument("--list-categories", action="store_true",
+                        help="List available categories")
+    p_comp.add_argument("-n", "--max-results", type=int, default=10)
+    p_comp.add_argument("-d", "--delay", type=float, default=1.0,
+                        help="Delay between searches in seconds (default: 1.0)")
+    p_comp.set_defaults(func=cmd_competitor)
+
+    # sota
+    p_sota = subparsers.add_parser("sota", help="SOTA technology knowledge base for GURMA.ai")
+    p_sota.add_argument("--analyze", "-a", type=str, default=None,
+                        help="Analyze a document and update knowledge base")
+    p_sota.add_argument("--show", "-s", type=str, default=None, nargs="?",
+                        const=None,
+                        choices=["models", "techniques", "stack", "principles", "actions", "sources"],
+                        help="Show specific KB section (default: summary)")
+    p_sota.set_defaults(func=cmd_sota)
+
+    # mali (Turkish company formation)
+    p_mali = subparsers.add_parser("mali", help="Türkiye şirket kuruluşu araştırması")
+    p_mali.add_argument("-c", "--categories", type=str, default=None,
+                        help="Virgülle ayrılmış kategoriler (varsayılan: tümü)")
+    p_mali.add_argument("--list-categories", action="store_true",
+                        help="Mevcut kategorileri listele")
+    p_mali.add_argument("-n", "--max-results", type=int, default=10)
+    p_mali.add_argument("-d", "--delay", type=float, default=1.0)
+    p_mali.set_defaults(func=cmd_mali)
+
+    # fonlar (Turkish government funding research)
+    p_fonlar = subparsers.add_parser("fonlar", help="TÜBİTAK ve devlet fonları araştırması")
+    p_fonlar.add_argument("-c", "--categories", type=str, default=None,
+                          help="Virgülle ayrılmış kategoriler (varsayılan: tümü)")
+    p_fonlar.add_argument("--list-categories", action="store_true",
+                          help="Mevcut kategorileri listele")
+    p_fonlar.add_argument("-n", "--max-results", type=int, default=10)
+    p_fonlar.add_argument("-d", "--delay", type=float, default=1.0)
+    p_fonlar.set_defaults(func=cmd_fonlar)
+
+    # extract
+    p_extract = subparsers.add_parser("extract", help="Extract competitor data to JSON")
+    p_extract.set_defaults(func=cmd_extract)
+
+    # list
+    p_list = subparsers.add_parser("list", help="List saved searches")
+    p_list.add_argument("-l", "--limit", type=int, default=20)
+    p_list.set_defaults(func=cmd_list)
+
+    args = parser.parse_args()
+
+    if hasattr(args, "func"):
+        args.func(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
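The dispatch pattern used throughout `main()` above, in miniature: each subparser binds its handler with `set_defaults(func=...)`, and the final `hasattr(args, "func")` check falls back to help output when no subcommand is given. A self-contained toy (command and argument names are illustrative, not from cli.py):

import argparse

def cmd_hello(args: argparse.Namespace) -> None:
    print(f"hello {args.name}")

parser = argparse.ArgumentParser()
sub = parser.add_subparsers(dest="command")
p_hello = sub.add_parser("hello")
p_hello.add_argument("name")
p_hello.set_defaults(func=cmd_hello)  # bind handler to the subcommand

args = parser.parse_args(["hello", "world"])
if hasattr(args, "func"):
    args.func(args)   # dispatches to cmd_hello -> prints "hello world"
else:
    parser.print_help()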
config.py ADDED
@@ -0,0 +1,99 @@
+"""
+GURMA.ai shared configuration.
+
+Environment detection, directory paths, API keys, and research constants
+used across all agents and the dashboard.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+
+# ============================================================
+# Environment Detection
+# ============================================================
+
+def _detect_project_root() -> Path:
+    """Detect project root based on environment."""
+    if os.getenv("HF_SPACE") or Path("/app/research.py").exists():
+        return Path("/app")
+    return Path(__file__).parent.parent.parent
+
+PROJECT_ROOT = _detect_project_root()
+IS_HF_SPACE = PROJECT_ROOT == Path("/app")
+
+if not IS_HF_SPACE:
+    try:
+        from dotenv import load_dotenv
+        load_dotenv(PROJECT_ROOT / ".env")
+    except ImportError:
+        pass
+
+
+# ============================================================
+# Directories
+# ============================================================
+
+if IS_HF_SPACE:
+    RESEARCH_DIR = PROJECT_ROOT / "data"
+    DATA_DIR = PROJECT_ROOT / "data"
+else:
+    RESEARCH_DIR = PROJECT_ROOT / "data"
+    DATA_DIR = PROJECT_ROOT / "src" / "dashboard"
+
+RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+
+# ============================================================
+# API Keys & LLM Config
+# ============================================================
+
+SERPAPI_KEY = os.getenv("SERPAPI_KEY")
+BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+LLM_MODEL = "deepseek/deepseek-chat"
+LLM_ENABLED = bool(OPENROUTER_API_KEY)
+
+
+# ============================================================
+# Research Constants
+# ============================================================
+
+COMPETITORS = [
+    "Hocoma", "Ekso Bionics", "Lifeward ReWalk", "Fourier Intelligence",
+    "Cyberdyne HAL", "Wandercraft", "Myomo", "Bionik",
+]
+
+BATCH_QUERY_TEMPLATES = [
+    "{company} latest news 2025 2026",
+    "{company} funding investors valuation",
+    "{company} FDA approval regulatory",
+    "{company} partnerships collaborations",
+    "{company} AI machine learning technology",
+    "site:accessdata.fda.gov {company}",
+    "site:clinicaltrials.gov {company} rehabilitation",
+    "site:crunchbase.com {company}",
+    "site:sec.gov {company} 10-K OR 8-K",
+    "site:patents.google.com {company} exoskeleton OR rehabilitation",
+]
+
+MARKET_QUERIES = [
+    "rehabilitation robotics market size 2026 forecast",
+    "exoskeleton market growth AI integration",
+    "rehabilitation robotics insurance reimbursement",
+    "medical exoskeleton FDA approval 2025",
+    "stroke rehabilitation AI technology",
+    "spinal cord injury exoskeleton treatment",
+    "rehabilitation robotics competitive landscape",
+    "site:exoskeletonreport.com 2025 2026",
+    "site:medgadget.com exoskeleton rehabilitation",
+    "site:fda.gov rehabilitation robotics guidance",
+    "MDR medical device regulation exoskeleton CE mark 2025",
+    "site:pubmed.ncbi.nlm.nih.gov rehabilitation robotics AI 2024 2025",
+    "exoskeleton insurance coverage CMS reimbursement code",
+    "rehabilitation robotics HCPCS code billing",
+]
@@ -0,0 +1,537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Competitor data extraction and opportunity detection.
3
+
4
+ Builds competitors.json from raw research files + deep intel findings,
5
+ detects market opportunities, and optionally synthesizes via LLM.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import re
12
+ from collections import defaultdict
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ try:
18
+ from .config import RESEARCH_DIR, DATA_DIR, LLM_ENABLED
19
+ from .llm import LLMClient
20
+ except ImportError:
21
+ from config import RESEARCH_DIR, DATA_DIR, LLM_ENABLED
22
+ from llm import LLMClient
23
+
24
+
25
+ # ============================================================
26
+ # Company Definitions & Extraction Patterns
27
+ # ============================================================
28
+
29
+ COMPANY_DEFINITIONS = {
30
+ "Hocoma": {"aliases": ["hocoma", "dih", "lokomat"], "country": "Switzerland", "product": "Lokomat", "status": "collapsed", "verified": True},
31
+ "Ekso Bionics": {"aliases": ["ekso", "eksobionics", "eksonr"], "country": "USA", "product": "EksoNR", "status": "weak", "verified": True},
32
+ "Cyberdyne": {"aliases": ["cyberdyne", "hal exoskeleton"], "country": "Japan", "product": "HAL", "status": "strong", "verified": True},
33
+ "Lifeward": {"aliases": ["lifeward", "rewalk", "alterg"], "country": "Israel/USA", "product": "ReWalk 7", "status": "consolidating", "verified": True},
34
+ "Fourier": {"aliases": ["fourier", "fourier intelligence"], "country": "China", "product": "X1, M2", "status": "growing", "verified": True},
35
+ "Myomo": {"aliases": ["myomo", "myopro"], "country": "USA", "product": "MyoPro", "status": "stable", "verified": False},
36
+ "Bionik": {"aliases": ["bionik", "inmotion"], "country": "Canada", "product": "InMotion", "status": "stable", "verified": False},
37
+ "Wandercraft": {"aliases": ["wandercraft", "atalante"], "country": "France", "product": "Atalante X", "status": "growing", "verified": False},
38
+ }
39
+
40
+ STATUS_KEYWORDS = [
41
+ ("collapsed", ["bankrupt", "delisted", "suspended", "collapse", "shut down", "ceased", "nasdaq delisted"]),
42
+ ("weak", ["52-week low", "struggling", "losses", "declining", "layoffs"]),
43
+ ("growing", ["series e", "series d", "series c", "funding round", "$109 million"]),
44
+ ("consolidating", ["acquired", "merger", "acquisition"]),
45
+ ("strong", ["leader", "dominant", "profitable"]),
46
+ ]
47
+
48
+ DATE_PATTERN = re.compile(
49
+ r'((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})'
50
+ r'|(\d{4}-\d{2}-\d{2})'
51
+ r'|(\d{4}-\d{2})'
52
+ r'|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4})'
53
+ )
54
+ MONEY_PATTERN = re.compile(r'\$[\d,]+(?:\.\d+)?(?:\s*(?:million|billion|M|B))?|\d+(?:\.\d+)?\s*(?:million|billion)', re.IGNORECASE)
55
+
56
+
57
+ # ============================================================
58
+ # Competitor Extractor
59
+ # ============================================================
60
+
61
+ class CompetitorExtractor:
62
+ """Extract structured competitor data from research results."""
63
+
64
+ def __init__(self, research_dir: Path = RESEARCH_DIR,
65
+ output_file: Path = None):
66
+ self.research_dir = research_dir
67
+ self.output_file = output_file or (DATA_DIR / "competitors.json")
68
+
69
+ def load_research_files(self) -> list[dict]:
70
+ results = []
71
+ if not self.research_dir.exists():
72
+ return results
73
+
74
+ for json_file in self.research_dir.glob("*.json"):
75
+ if json_file.name.startswith("."):
76
+ continue
77
+ try:
78
+ with open(json_file) as f:
79
+ data = json.load(f)
80
+ data["_source_file"] = json_file.name
81
+ results.append(data)
82
+ except Exception as e:
83
+ print(f"Error loading {json_file}: {e}")
84
+
85
+ return results
86
+
87
+ def find_mentions(self, text: str) -> list[str]:
88
+ text_lower = text.lower()
89
+ mentioned = []
90
+ for company, info in COMPANY_DEFINITIONS.items():
91
+ if any(alias in text_lower for alias in info["aliases"]):
92
+ mentioned.append(company)
93
+ return mentioned
94
+
95
+ def normalize_date(self, date_str: str) -> str | None:
96
+ formats = ["%B %d, %Y", "%B %d %Y", "%b %d, %Y", "%b %d %Y", "%Y-%m-%d", "%Y-%m"]
97
+ for fmt in formats:
98
+ try:
99
+ dt = datetime.strptime(date_str.strip(), fmt)
100
+ if dt.year < 2010:
101
+ return None
102
+ return dt.strftime("%Y-%m-%d")
103
+ except:
104
+ pass
105
+ return date_str
106
+
107
+ def extract_events(self, text: str, company: str) -> list[dict]:
108
+ events = []
109
+ aliases = COMPANY_DEFINITIONS[company]["aliases"]
110
+
111
+ for match in DATE_PATTERN.finditer(text):
112
+ date_str = match.group(0)
113
+ if not date_str:
114
+ continue
115
+
116
+ start = max(0, match.start() - 50)
117
+ end = min(len(text), match.end() + 150)
118
+ context = text[start:end]
119
+
120
+ normalized = self.normalize_date(date_str)
121
+ if normalized and any(alias in context.lower() for alias in aliases):
122
+ events.append({
123
+ "date": normalized,
124
+ "context": context.strip()
125
+ })
126
+
127
+ return events
128
+
129
+ def detect_status(self, snippets: list[str], default: str) -> str:
130
+ text = " ".join(snippets).lower()
131
+ for status, keywords in STATUS_KEYWORDS:
132
+ if any(kw.lower() in text for kw in keywords):
133
+ return status
134
+ return default
135
+
136
+ def extract_stock(self, snippets: list[str]) -> Optional[float]:
137
+ for snippet in snippets:
138
+ match = re.search(r'\$(\d+\.?\d*)', snippet)
139
+ if match and float(match.group(1)) < 1000:
140
+ return float(match.group(1))
141
+ return None
142
+
143
+ def extract_funding(self, money_mentions: list[str]) -> Optional[int]:
144
+ for m in money_mentions:
145
+ match = re.search(r'(\d+)\s*(?:million|M)', m, re.IGNORECASE)
146
+ if match:
147
+ return int(match.group(1)) * 1_000_000
148
+ match = re.search(r'(\d+\.?\d*)\s*(?:billion|B)', m, re.IGNORECASE)
149
+ if match:
150
+ return int(float(match.group(1)) * 1_000_000_000)
151
+ return None
152
+
153
+ def _load_intel_findings(self) -> dict[str, list[dict]]:
154
+ """Load confirmed findings from Deep Intel reports, grouped by company."""
155
+ intel_dir = self.research_dir / "intel"
156
+ if not intel_dir.exists():
157
+ return {}
158
+
159
+ findings_by_company: dict[str, list[dict]] = {}
160
+ seen_companies: set[str] = set()
161
+
162
+ for json_file in sorted(intel_dir.glob("*_intel.json"), reverse=True):
163
+ try:
164
+ with open(json_file) as f:
165
+ data = json.load(f)
166
+ company = data.get("company", "")
167
+ if not company or company in seen_companies:
168
+ continue
169
+ seen_companies.add(company)
170
+
171
+ all_findings = []
172
+ for section in data.get("sections", {}).values():
173
+ for finding in section.get("findings", []):
174
+ if isinstance(finding, dict) and finding.get("text"):
175
+ all_findings.append(finding)
176
+ elif isinstance(finding, str) and finding:
177
+ all_findings.append({"text": finding, "confirmed": False, "source": ""})
178
+
179
+ if all_findings:
180
+ findings_by_company[company] = all_findings
181
+ except Exception:
182
+ pass
183
+
184
+ return findings_by_company
185
+
186
+ def _extract_intel_opportunities(self, intel_findings: dict[str, list[dict]]) -> list[dict]:
187
+ """Extract opportunity signals from Deep Intel confirmed findings."""
188
+ opportunities = []
189
+
190
+ vuln_patterns = [
191
+ (r'(?:layoff|restructur|downsiz|headcount.?reduc)', "workforce_cut", 2),
192
+ (r'(?:delist|stock.?(?:drop|fall|declin)|52.week.low|penny.stock)', "financial_distress", 1),
193
+ (r'(?:FDA.?(?:reject|warning|recall)|regulatory.?(?:issue|fail|delay))', "regulatory_issue", 2),
194
+ (r'(?:bankrupt|insolvenc|cease.?operat|wind.?down|liquidat)', "collapse", 1),
195
+ (r'(?:customer.?complain|negative.?review|churn|losing.?customer)', "customer_risk", 2),
196
+ (r'(?:legacy|technical.?debt|outdated|proprietary.?lock)', "tech_weakness", 3),
197
+ (r'(?:no.?AI|lack.?(?:of.?)?(?:data|machine.learn|personali))', "ai_gap", 2),
198
+ ]
199
+
200
+ for company, findings in intel_findings.items():
201
+ confirmed = [f for f in findings if f.get("confirmed")]
202
+ all_text = " ".join(f["text"] for f in confirmed).lower() if confirmed else ""
203
+ all_text_full = " ".join(f["text"] for f in findings).lower()
204
+
205
+ for pattern, opp_type, priority in vuln_patterns:
206
+ if re.search(pattern, all_text, re.IGNORECASE):
207
+ match_finding = next(
208
+ (f for f in confirmed if re.search(pattern, f["text"], re.IGNORECASE)),
209
+ None
210
+ )
211
+ if match_finding:
212
+ opportunities.append({
213
+ "type": opp_type,
214
+ "text": f"{company}: {match_finding['text'][:120]}",
215
+ "priority": priority,
216
+ "confirmed": True,
217
+ "source": match_finding.get("source", ""),
218
+ "company": company,
219
+ })
220
+ elif re.search(pattern, all_text_full, re.IGNORECASE):
221
+ match_finding = next(
222
+ (f for f in findings if re.search(pattern, f["text"], re.IGNORECASE)),
223
+ None
224
+ )
225
+ if match_finding:
226
+ opportunities.append({
227
+ "type": opp_type,
228
+ "text": f"{company}: {match_finding['text'][:120]}",
229
+ "priority": priority + 1,
230
+ "confirmed": False,
231
+ "source": match_finding.get("source", ""),
232
+ "company": company,
233
+ })
234
+
235
+ return opportunities
236
+
237
+ def _load_sota_tech_signals(self) -> list[dict]:
238
+ """Load tech advantage signals from SOTA knowledge base."""
239
+ kb_path = self.research_dir / "sota" / "knowledge_base.json"
240
+ if not kb_path.exists():
241
+ return []
242
+
243
+ try:
244
+ with open(kb_path) as f:
245
+ kb = json.load(f)
246
+ except Exception:
247
+ return []
248
+
249
+ signals = []
250
+
251
+ for t in kb.get("techniques", []):
252
+ if t.get("priority") == "high" and t.get("gurma_fit"):
253
+ signals.append({
254
+ "type": "tech_advantage",
255
+ "text": f"{t['name']}: {t['gurma_fit'][:120]}",
256
+ "priority": 2,
257
+ "confirmed": True,
258
+ "company": "GURMA",
259
+ })
260
+
261
+ for p in kb.get("key_principles", [])[:2]:
262
+ if p.get("principle"):
263
+ signals.append({
264
+ "type": "tech_principle",
265
+ "text": f"{p['principle']}: {p.get('detail', '')[:100]}",
266
+ "priority": 3,
267
+ "confirmed": True,
268
+ "company": "GURMA",
269
+ })
270
+
271
+ return signals
272
+
273
+ def _opportunity_changed(self, new_opps: list[dict], existing: dict) -> bool:
274
+ existing_points = set(existing.get("points", []))
275
+ new_points = set(o["text"] for o in new_opps[:4])
276
+
277
+ if not existing_points:
278
+ return True
279
+
280
+ new_p1_types = {o["type"] for o in new_opps if o["priority"] == 1}
281
+ old_raw = existing.get("raw_opportunities", [])
282
+ old_p1_types = {o["type"] for o in old_raw if o.get("priority") == 1}
283
+ if new_p1_types != old_p1_types:
284
+ return True
285
+
286
+ overlap = existing_points & new_points
287
+ if len(overlap) < len(existing_points) / 2:
288
+ return True
289
+
290
+ return False
291
+
292
+ def _synthesize_opportunity_llm(self, opportunities: list[dict],
293
+ competitors: list[dict]) -> Optional[dict]:
294
+ if not LLM_ENABLED:
295
+ return None
296
+
297
+ llm = LLMClient()
298
+
299
+ opp_text = "\n".join(
300
+ f"- [{o['type']}] {'[CONFIRMED]' if o.get('confirmed') else '[SPECULATIVE]'} {o['text']}"
301
+ for o in opportunities[:12]
302
+ )
303
+
304
+ comp_summary = "\n".join(
305
+ f"- {c['name']}: status={c['status']}, "
306
+ f"{'stock=$'+format(c['stock'], '.2f') if c.get('stock') else 'no stock data'}, "
307
+ f"{'funding=$'+format(c['funding']/1e6, '.0f')+'M' if c.get('funding') else 'no funding data'}"
308
+ for c in competitors[:8]
309
+ )
310
+
311
+ system = (
312
+ "You are a strategic advisor for GURMA.ai, a Swiss AI company "
313
+ "entering rehabilitation robotics with 15 years of patient outcome "
314
+ "data (not just motion data) from BAMA Teknoloji. "
315
+ "You produce concise, actionable strategic assessments."
316
+ )
317
+
318
+ prompt = f"""Based on the following competitive + technology signals and competitor data,
319
+ produce a strategic opportunity assessment for GURMA.ai.
320
+
321
+ Signals (competitive, tech advantages, and threats):
322
+ {opp_text}
323
+
324
+ Competitor landscape:
325
+ {comp_summary}
326
+
327
+ Return JSON:
328
+ {{
329
+ "headline": "One punchy sentence (max 10 words) summarizing the #1 strategic opportunity",
330
+ "points": [
331
+ "Actionable insight 1 (max 20 words, include numbers where available)",
332
+ "Actionable insight 2",
333
+ "Actionable insight 3",
334
+ "Actionable insight 4"
335
+ ]
336
+ }}
337
+
338
+ Rules:
339
+ - Headline should be about the OPPORTUNITY, not just a competitor's problem
340
+ - Points should mix competitive windows, tech advantages, AND threats
341
+ - Be specific: include dollar amounts, dates, competitor names, model/technique names
342
+ - Maximum 4 points, ranked by strategic importance
343
+ - confirmed signals should be weighted more heavily than speculative ones"""
344
+
345
+ response = llm.call(prompt, system, max_tokens=500)
346
+ if response:
347
+ match = re.search(r'\{.*\}', response, re.DOTALL)
348
+ if match:
349
+ try:
350
+ result = json.loads(match.group())
351
+ if result.get("headline") and result.get("points"):
352
+ return result
353
+ except Exception:
354
+ pass
355
+ return None
356
+
357
+ def detect_opportunities(self, competitors: list[dict], all_snippets: list[str]) -> dict:
358
+ """Detect market opportunities from competitor data + Deep Intel findings."""
359
+ opportunities = []
360
+
361
+ collapsed = [c for c in competitors if c["status"] == "collapsed"]
362
+ weak = [c for c in competitors if c["status"] == "weak"]
363
+
364
+ if collapsed:
365
+ names = ", ".join(c["name"] for c in collapsed)
366
+ opportunities.append({
367
+ "type": "market_gap",
368
+ "text": f"{names} collapsed — customers seeking alternatives",
369
+ "priority": 1, "confirmed": True, "company": names,
370
+ })
371
+
372
+ if weak:
373
+ for c in weak:
374
+ opp_text = f"{c['name']} financially weak"
375
+ if c.get("stock"):
376
+ opp_text += f" (${c['stock']:.2f})"
377
+ opp_text += " — vulnerable to disruption"
378
+ opportunities.append({
379
+ "type": "weakness",
380
+ "text": opp_text,
381
+ "priority": 2, "confirmed": True, "company": c["name"],
382
+ })
383
+
384
+ growing = [c for c in competitors if c["status"] == "growing" and c.get("funding")]
385
+ for c in growing:
386
+ funding_m = c["funding"] / 1_000_000
387
+ opportunities.append({
388
+ "type": "threat",
389
+ "text": f"{c['name']} well-funded (${funding_m:.0f}M) — monitor closely",
390
+ "priority": 3, "confirmed": True, "company": c["name"],
391
+ })
392
+
393
+ if competitors:
394
+ opportunities.append({
395
+ "type": "advantage",
396
+ "text": "BAMA has 15 years outcome data vs. competitors' motion data",
397
+ "priority": 1, "confirmed": True, "company": "BAMA",
398
+ })
399
+
400
+ intel_findings = self._load_intel_findings()
401
+ if intel_findings:
402
+ intel_opps = self._extract_intel_opportunities(intel_findings)
403
+ existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
404
+ for io in intel_opps:
405
+ key = (io.get("company", ""), io["type"])
406
+ if key not in existing_keys:
407
+ opportunities.append(io)
408
+ existing_keys.add(key)
409
+
410
+ sota_signals = self._load_sota_tech_signals()
411
+ if sota_signals:
412
+ existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
413
+ for ts in sota_signals:
414
+ key = (ts.get("company", ""), ts["type"])
415
+ if key not in existing_keys:
416
+ opportunities.append(ts)
417
+ existing_keys.add(key)
418
+
419
+ opportunities.sort(key=lambda x: x["priority"])
420
+
421
+ llm_result = self._synthesize_opportunity_llm(opportunities, competitors)
422
+
423
+ if llm_result:
424
+ headline = llm_result["headline"]
425
+ points = llm_result["points"][:4]
426
+ else:
427
+ if collapsed:
428
+ headline = f"{collapsed[0]['name']} collapse creates market window"
429
+ elif weak:
430
+ headline = "Competitor weakness creates opportunity"
431
+ else:
432
+ headline = "Data advantage positions GURMA.ai for growth"
433
+ points = [o["text"] for o in opportunities[:4]]
434
+
435
+ sources = ["competitor"]
436
+ if intel_findings:
437
+ sources.append("intel")
438
+ if sota_signals:
439
+ sources.append("tech")
440
+ if llm_result:
441
+ sources.append("llm")
442
+
443
+ return {
444
+ "headline": headline,
445
+ "points": points,
446
+ "detected_at": datetime.now().strftime("%Y-%m-%d"),
447
+ "raw_opportunities": opportunities,
448
+ "sources": sources,
449
+ }
450
+
451
+ def load_existing_data(self) -> Optional[dict]:
452
+ if self.output_file.exists():
453
+ try:
454
+ with open(self.output_file) as f:
455
+ return json.load(f)
456
+ except Exception:
457
+ pass
458
+ return None
459
+
460
+ def process(self) -> dict:
461
+ research_data = self.load_research_files()
462
+ if not research_data:
463
+ return {"competitors": [], "market": {}}
464
+
465
+ company_data = defaultdict(lambda: {
466
+ "mentions": 0, "snippets": [], "events": [], "money": [], "urls": []
467
+ })
468
+
469
+ for research in research_data:
470
+ for result in research.get("results", []):
471
+ text = f"{result.get('title', '')} {result.get('snippet', '')}"
472
+ url = result.get("url", "")
473
+
474
+ for company in self.find_mentions(text):
475
+ cd = company_data[company]
476
+ cd["mentions"] += 1
477
+ cd["snippets"].append(result.get("snippet", "")[:200])
478
+ cd["urls"].append(url)
479
+ cd["events"].extend(self.extract_events(text, company))
480
+ cd["money"].extend(MONEY_PATTERN.findall(text))
481
+
482
+ competitors = []
483
+ for company, info in COMPANY_DEFINITIONS.items():
484
+ data = company_data[company]
485
+
486
+ status = info["status"] if info.get("verified") else self.detect_status(data["snippets"], info["status"])
487
+
488
+ competitors.append({
489
+ "name": company,
490
+ "country": info["country"],
491
+ "product": info["product"],
492
+ "status": status,
493
+ "stock": self.extract_stock(data["snippets"]),
494
+ "funding": self.extract_funding(data["money"]),
495
+ "notes": data["snippets"][0] if data["snippets"] else "",
496
+ "mentions": data["mentions"],
497
+ "events": [{"date": e["date"], "event": e["context"][:100]} for e in data["events"][:10]],
498
+ "sample_urls": list(set(data["urls"]))[:5],
499
+ })
500
+
501
+ competitors.sort(key=lambda x: x["mentions"], reverse=True)
502
+
503
+ all_snippets = []
504
+ for company, data in company_data.items():
505
+ all_snippets.extend(data["snippets"])
506
+ new_opportunity = self.detect_opportunities(competitors, all_snippets)
507
+
508
+ existing = self.load_existing_data()
509
+ existing_opp = existing.get("opportunity", {}) if existing else {}
510
+
511
+ if existing_opp.get("confirmed"):
512
+ if self._opportunity_changed(new_opportunity.get("raw_opportunities", []), existing_opp):
513
+ opportunity = existing_opp
514
+ opportunity["update_available"] = True
515
+ opportunity["suggested_update"] = new_opportunity
516
+ else:
517
+ opportunity = existing_opp
518
+ opportunity["update_available"] = False
519
+ else:
520
+ opportunity = new_opportunity
521
+ opportunity["confirmed"] = False
522
+ opportunity["update_available"] = False
523
+
524
+ return {
525
+ "competitors": competitors,
526
+ "market": {"size_2024": 2_000_000_000, "size_2029_ai": 9_100_000_000, "cagr": 0.278},
527
+ "opportunity": opportunity,
528
+ "_generated": datetime.now().isoformat(),
529
+ "_source_files": [f.name for f in self.research_dir.glob("*.json") if not f.name.startswith(".")]
530
+ }
531
+
532
+ def save(self, data: dict = None) -> Path:
533
+ data = data or self.process()
534
+ self.output_file.parent.mkdir(parents=True, exist_ok=True)
535
+ with open(self.output_file, "w") as f:
536
+ json.dump(data, f, indent=2)
537
+ return self.output_file
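
Taken together, `CompetitorExtractor` is a small pipeline: load saved search JSON, tally company mentions, and write `competitors.json` for the dashboard. A minimal usage sketch, assuming the flat HF Space layout where the module imports as `extract` (paths are illustrative):

```python
from pathlib import Path

from extract import CompetitorExtractor

extractor = CompetitorExtractor(
    research_dir=Path("data"),                  # folder of saved *.json search results
    output_file=Path("data/competitors.json"),  # consumed by the dashboard's load_data()
)
data = extractor.process()   # aggregate mentions, events, funding, opportunities
path = extractor.save(data)  # write competitors.json (creates parent dirs)
print(f"{len(data['competitors'])} competitors -> {path}")
```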
intel.py ADDED
@@ -0,0 +1,508 @@
1
+ """
2
+ Deep competitive intelligence agent.
3
+
4
+ Runs structured research across categories for a single competitor,
5
+ producing markdown + JSON reports with [CONFIRMED]/[SPECULATIVE] tagging.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import re
12
+ import time
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+
17
+ try:
18
+ from .config import RESEARCH_DIR, LLM_ENABLED
19
+ from .search import SearchService, WebSearchResult
20
+ from .llm import LLMClient
21
+ except ImportError:
22
+ from config import RESEARCH_DIR, LLM_ENABLED
23
+ from search import SearchService, WebSearchResult
24
+ from llm import LLMClient
25
+
26
+
27
+ # ============================================================
28
+ # Intel Constants
29
+ # ============================================================
30
+
31
+ DEEP_INTEL_CATEGORIES = {
32
+ "company_overview": {
33
+ "label": "Company Overview",
34
+ "queries": [
35
+ "{company} founding history milestones",
36
+ "{company} CEO leadership team background",
37
+ "{company} funding rounds investors valuation",
38
+ "{company} employee count headcount growth",
39
+ ],
40
+ },
41
+ "product_technology": {
42
+ "label": "Product & Technology",
43
+ "queries": [
44
+ "{company} exoskeleton rehabilitation robot product specifications",
45
+ "{company} AI machine learning technology capabilities",
46
+ "{company} new product launch release 2025 2026",
47
+ "{company} patent filings exoskeleton rehabilitation innovation",
48
+ "site:patents.google.com {company} exoskeleton OR rehabilitation",
49
+ ],
50
+ },
51
+ "regulatory_clinical": {
52
+ "label": "Regulatory & Clinical",
53
+ "queries": [
54
+ "site:accessdata.fda.gov {company}",
55
+ "{company} FDA 510k clearance CE mark MDR approval",
56
+ "site:clinicaltrials.gov {company} rehabilitation",
57
+ "{company} clinical outcomes study peer-reviewed results",
58
+ ],
59
+ },
60
+ "market_channels": {
61
+ "label": "Market & Channels",
62
+ "queries": [
63
+ "{company} hospital clinic installations customer base",
64
+ "{company} insurance reimbursement coverage CMS",
65
+ "{company} partnerships distributors resellers",
66
+ "{company} conference MEDICA ACRM CES 2025 2026",
67
+ ],
68
+ },
69
+ "vulnerabilities_threats": {
70
+ "label": "Vulnerabilities & Threats",
71
+ "queries": [
72
+ "{company} weaknesses problems criticism recall",
73
+ "{company} layoffs restructuring financial difficulty",
74
+ "{company} Glassdoor employee reviews satisfaction",
75
+ "{company} rehabilitation robotics AI expansion strategy 2025 2026",
76
+ "{company} acquisitions mergers market share growth",
77
+ "site:sec.gov {company} 10-K OR 8-K",
78
+ ],
79
+ },
80
+ }
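+
+ # Query templates expand per company before searching, e.g. (illustrative):
+ #   "{company} FDA 510k clearance CE mark MDR approval".format(company="Ekso Bionics")
+ #   -> "Ekso Bionics FDA 510k clearance CE mark MDR approval"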
81
+
82
+ PRIMARY_SOURCE_DOMAINS = {
83
+ "sec.gov", "fda.gov", "clinicaltrials.gov", "patents.google.com",
84
+ "accessdata.fda.gov",
85
+ "crunchbase.com", "tracxn.com", "pitchbook.com", "cbinsights.com",
86
+ "bloomberg.com", "reuters.com", "wsj.com", "finance.yahoo.com",
87
+ "wellfound.com",
88
+ "linkedin.com", "glassdoor.com",
89
+ "g2.com", "capterra.com", "trustpilot.com",
90
+ "therobotreport.com", "exoskeletonreport.com", "medgadget.com",
91
+ }
92
+
93
+ CATEGORY_EXPECTED = {
94
+ "company_overview": {
95
+ "founding_year": [r'(?:founded|established|incorporated|started)\s+(?:in\s+)?(\d{4})'],
96
+ "leadership": [r'(?:CEO|Chief Executive|CTO|CFO|President|Founder|Chairman|COO)'],
97
+ "funding": [r'\$[\d,.]+\s*(?:million|billion|M|B)', r'(?:series\s+[A-F]|seed|IPO|funding\s+round)'],
98
+ "employees": [r'(\d[\d,]*)\s*(?:employees|staff|headcount|team\s+members|workers)'],
99
+ },
100
+ "product_technology": {
101
+ "products": [r'(?:product|device|robot|exoskeleton|system)\s'],
102
+ "technology": [r'(?:AI|machine\s+learning|deep\s+learning|sensor|actuator|algorithm|neural)'],
103
+ "patents": [r'(?:patent|IP|intellectual\s+property|invention)'],
104
+ "recent_launches": [r'(?:launch|release|announc|unveil|introduc)\w*\s+.{0,30}(?:2025|2026)'],
105
+ },
106
+ "regulatory_clinical": {
107
+ "fda_clearance": [r'(?:510\(?k\)?|FDA.?clear|FDA.?approv|de\s*novo)'],
108
+ "ce_mark": [r'(?:CE.?mark|MDR|EU.?approv|notified.?body)'],
109
+ "clinical_trials": [r'(?:clinical.?trial|NCT\d|randomized|controlled.?study|peer.?review)'],
110
+ "clinical_outcomes": [r'(?:outcome|efficacy|recovery.?rate|improvement|functional.?score)'],
111
+ },
112
+ "market_channels": {
113
+ "installations": [r'(?:hospital|clinic|center|install|deploy|site)\s'],
114
+ "reimbursement": [r'(?:reimburse|insurance|CMS|Medicare|Medicaid|HCPCS|coverage|payer)'],
115
+ "partnerships": [r'(?:partner|alliance|collaborat|distribut|reseller|dealer)'],
116
+ "events": [r'(?:conference|MEDICA|ACRM|CES|expo|trade\s+show|summit)'],
117
+ },
118
+ "vulnerabilities_threats": {
119
+ "weaknesses": [r'(?:weakness|problem|challenge|struggle|fail|recall|warning)'],
120
+ "financial_stress": [r'(?:layoff|restructur|loss|declining|debt|delist|penny.stock)'],
121
+ "employee_sentiment": [r'(?:glassdoor|employee.?review|work.?culture|turnover)'],
122
+ "expansion": [r'(?:expansion|new.?market|acqui|merger|market.?share|growth.?strategy)'],
123
+ },
124
+ }
125
+
126
+ CATEGORY_SYNTHESIS_QUESTIONS = {
127
+ "company_overview": [
128
+ "Founding story and key milestones",
129
+ "Leadership team (backgrounds, medical device experience)",
130
+ "Funding history (rounds, investors, valuations)",
131
+ "Employee count and growth trajectory",
132
+ ],
133
+ "product_technology": [
134
+ "Product catalog (devices, indications, patient populations)",
135
+ "AI / machine learning capabilities (data they train on, algorithms used)",
136
+ "Recent product launches and roadmap clues (last 12 months)",
137
+ "Patent portfolio and innovation direction",
138
+ "How does their technology compare to GURMA.ai's outcome-data approach?",
139
+ ],
140
+ "regulatory_clinical": [
141
+ "FDA clearances (510(k) numbers, De Novo, dates)",
142
+ "CE mark / MDR status in Europe",
143
+ "Active clinical trials (ClinicalTrials.gov entries, endpoints)",
144
+ "Published clinical outcomes (peer-reviewed studies, recovery rates)",
145
+ "Reimbursement status (CMS, Medicare, private payer coverage)",
146
+ ],
147
+ "market_channels": [
148
+ "Hospital and clinic installations (how many sites, which countries)",
149
+ "Insurance and reimbursement strategy (pricing, payer relationships)",
150
+ "Distribution partnerships and reseller network",
151
+ "Conference and KOL presence (MEDICA, ACRM, physician endorsements)",
152
+ ],
153
+ "vulnerabilities_threats": [
154
+ "What are they bad at? (clinical limitations, missing indications)",
155
+ "Financial health (SEC filings, cash burn, stock trajectory)",
156
+ "Employee sentiment (Glassdoor, hiring patterns, layoffs)",
157
+ "Growth strategy (acquisitions, new markets, AI investments)",
158
+ "What could they do that would hurt GURMA.ai most?",
159
+ "Early warning signals to monitor",
160
+ ],
161
+ }
162
+
163
+
164
+ # ============================================================
165
+ # Intel Agent
166
+ # ============================================================
167
+
168
+ @dataclass
169
+ class IntelSection:
170
+ category: str
171
+ label: str
172
+ queries_executed: list = field(default_factory=list)
173
+ results: list = field(default_factory=list)
174
+ findings: list = field(default_factory=list)
175
+ gaps: list = field(default_factory=list)
176
+ sources: list = field(default_factory=list)
177
+
178
+
179
+ class CompetitorIntelAgent:
180
+ """Deep competitive intelligence agent for a single competitor.
181
+
182
+ Usage:
183
+ agent = CompetitorIntelAgent("Ekso Bionics")
184
+ report = agent.run()
185
+ report = agent.run(use_external_llm=True)
186
+ """
187
+
188
+ def __init__(self, company: str, search: SearchService = None, llm: LLMClient = None):
189
+ self.company = company
190
+ self.search = search or SearchService()
191
+ self.llm = llm or LLMClient()
192
+ self.sections: dict[str, IntelSection] = {}
193
+ self.output_dir = RESEARCH_DIR / "intel"
194
+ self.output_dir.mkdir(parents=True, exist_ok=True)
195
+
196
+ def run(self, categories: list[str] = None, use_external_llm: bool = False,
197
+ delay: float = 1.0, max_results: int = 10) -> Path:
198
+ cats = categories or list(DEEP_INTEL_CATEGORIES.keys())
199
+
200
+ total_queries = sum(
201
+ len(DEEP_INTEL_CATEGORIES[c]["queries"])
202
+ for c in cats if c in DEEP_INTEL_CATEGORIES
203
+ )
204
+
205
+ print(f"\n{'='*60}")
206
+ print(f"Deep Competitive Intelligence: {self.company}")
207
+ print(f"Categories: {len(cats)} | Queries: ~{total_queries}")
208
+ print(f"Analysis: built-in{' + external LLM' if use_external_llm and self.llm.enabled else ''}")
209
+ print(f"{'='*60}\n")
210
+
211
+ for cat_key in cats:
212
+ cat = DEEP_INTEL_CATEGORIES.get(cat_key)
213
+ if not cat:
214
+ print(f"[SKIP] Unknown category: {cat_key}")
215
+ continue
216
+
217
+ section = IntelSection(category=cat_key, label=cat["label"])
218
+ self._research_category(section, cat, use_external_llm, delay, max_results)
219
+ self.sections[cat_key] = section
220
+
221
+ report_path = self._generate_report(use_external_llm)
222
+ self._save_data()
223
+
224
+ print(f"\n{'='*60}")
225
+ print(f"Report: {report_path}")
226
+ total_findings = sum(len(s.findings) for s in self.sections.values())
227
+ total_gaps = sum(len(s.gaps) for s in self.sections.values())
228
+ print(f"Findings: {total_findings} | Gaps: {total_gaps}")
229
+ print(f"{'='*60}\n")
230
+
231
+ return report_path
232
+
233
+ def _research_category(self, section: IntelSection, cat: dict,
234
+ use_external_llm: bool, delay: float, max_results: int):
235
+ print(f"\n--- {section.label} ---")
236
+
237
+ queries = [q.format(company=self.company) for q in cat["queries"]]
238
+
239
+ if use_external_llm and self.llm.enabled:
240
+ extra = self.llm.generate_category_queries(self.company, section.label)
241
+ if extra:
242
+ queries.extend(extra)
243
+ print(f" [EXTERNAL LLM] +{len(extra)} additional queries")
244
+
245
+ for query in queries:
246
+ print(f" [SEARCH] {query}")
247
+ try:
248
+ results = self.search.search(query, max_results=max_results, save=True)
249
+ section.queries_executed.append(query)
250
+ section.results.extend(results)
251
+ for r in results:
252
+ if r.url and r.url not in section.sources:
253
+ section.sources.append(r.url)
254
+ print(f" -> {len(results)} results")
255
+ except Exception as e:
256
+ print(f" -> Error: {e}")
257
+
258
+ if delay > 0:
259
+ time.sleep(delay)
260
+
261
+ section.findings = self._analyze_section(section)
262
+ section.gaps = self._detect_gaps(section)
263
+
264
+ confirmed = sum(1 for f in section.findings if f.get("confirmed"))
265
+ speculative = len(section.findings) - confirmed
266
+ print(f" [ANALYSIS] {len(section.findings)} findings ({confirmed} confirmed, {speculative} speculative)")
267
+ if section.gaps:
268
+ print(f" [GAPS] {len(section.gaps)}: {', '.join(g['text'] for g in section.gaps[:3])}")
269
+
270
+ if self.llm.enabled and section.results:
271
+ print(f" [SYNTHESIS] Synthesizing {section.label}...")
272
+ synthesis = self.llm.synthesize_intel(
273
+ self.company, section.category, section.label, section.results
274
+ )
275
+ synth_findings = synthesis.get("findings", [])
276
+ synth_gaps = synthesis.get("gaps", [])
277
+
278
+ if synth_findings:
279
+ synth_sources = {f.get("source", "") for f in synth_findings if f.get("source")}
280
+ for bf in section.findings:
281
+ if bf.get("source") and bf["source"] not in synth_sources:
282
+ synth_findings.append(bf)
283
+ section.findings = synth_findings
284
+
285
+ for f in synth_findings:
286
+ if isinstance(f, dict):
287
+ tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
288
+ print(f" {tag} {f.get('text', '')[:80]}")
289
+
290
+ existing_gaps = {g["text"].lower() for g in section.gaps}
291
+ for sg in synth_gaps:
292
+ gap_text = sg.get("text", "") if isinstance(sg, dict) else sg
293
+ if gap_text and gap_text.lower() not in existing_gaps:
294
+ section.gaps.append({"text": gap_text})
295
+
296
+ def _analyze_section(self, section: IntelSection) -> list[dict]:
297
+ findings = []
298
+ seen_keys = set()
299
+ aliases = self._get_aliases()
300
+
301
+ for r in section.results:
302
+ text_lower = f"{r.title} {r.snippet}".lower()
303
+
304
+ if not any(alias in text_lower for alias in aliases):
305
+ continue
306
+
307
+ dedup_key = re.sub(r'[^a-z0-9]', '', r.title.lower()[:50])
308
+ if dedup_key in seen_keys:
309
+ continue
310
+ seen_keys.add(dedup_key)
311
+
312
+ confirmed = self._is_primary_source(r.url)
313
+
314
+ title = r.title.strip()
315
+ snippet = r.snippet.strip()[:250]
316
+ finding_text = f"{title}: {snippet}" if snippet else title
317
+
318
+ findings.append({
319
+ "text": finding_text,
320
+ "source": r.url,
321
+ "confirmed": confirmed,
322
+ })
323
+
324
+ findings.sort(key=lambda f: (not f["confirmed"], -len(f["text"])))
325
+ return findings[:15]
326
+
327
+ def _is_primary_source(self, url: str) -> bool:
328
+ if not url:
329
+ return False
330
+ url_lower = url.lower()
331
+
332
+ for domain in PRIMARY_SOURCE_DOMAINS:
333
+ if domain in url_lower:
334
+ return True
335
+
336
+ # Treat the competitor's own site as primary: look for an alias slug in the URL host.
+ parts = url_lower.split("/")
+ host = parts[2] if len(parts) > 2 else ""
+ for alias in self._get_aliases():
+ slug = alias.replace(" ", "")
+ if len(slug) >= 4 and slug in host:
+ return True
340
+
341
+ return False
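+
+ # Illustrative results (hypothetical URLs):
+ #   _is_primary_source("https://www.sec.gov/cgi-bin/browse-edgar") -> True  (primary domain)
+ #   _is_primary_source("https://eksobionics.com/news") -> True, assuming an alias slug
+ #     like "ekso" is defined for the company in COMPANY_DEFINITIONS
+ #   _is_primary_source("https://randomblog.example/post") -> False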
342
+
343
+ def _detect_gaps(self, section: IntelSection) -> list[dict]:
344
+ expected = CATEGORY_EXPECTED.get(section.category, {})
345
+ if not expected:
346
+ return []
347
+
348
+ aliases = self._get_aliases()
349
+ relevant_text = " ".join(
350
+ f"{r.title} {r.snippet}"
351
+ for r in section.results
352
+ if any(a in f"{r.title} {r.snippet}".lower() for a in aliases)
353
+ )
354
+
355
+ if not relevant_text:
356
+ return [{"text": f"No relevant results found for {section.label}"}]
357
+
358
+ relevant_lower = relevant_text.lower()
359
+ gaps = []
360
+ for field_name, patterns in expected.items():
361
+ found = any(
362
+ re.search(p, relevant_lower, re.IGNORECASE)
363
+ for p in patterns
364
+ )
365
+ if not found:
366
+ label = field_name.replace("_", " ").replace("/", " / ")
367
+ gaps.append({"text": f"No data found for: {label}"})
368
+
369
+ return gaps
370
+
371
+ def _get_aliases(self) -> list[str]:
372
+ try:
373
+ from .extract import COMPANY_DEFINITIONS
374
+ except ImportError:
375
+ from extract import COMPANY_DEFINITIONS
376
+ info = COMPANY_DEFINITIONS.get(self.company, {})
377
+ aliases = info.get("aliases", [])
378
+ if not aliases:
379
+ aliases = [self.company.lower()]
380
+ return aliases
381
+
382
+ def _generate_report(self, use_external_llm: bool) -> Path:
383
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
384
+ slug = self.company.lower().replace(" ", "-").replace("/", "-")
385
+ slug = "".join(c for c in slug if c.isalnum() or c == "-")
386
+
387
+ report_path = self.output_dir / f"{timestamp}_{slug}_intel.md"
388
+
389
+ method = "Built-in analysis"
390
+ if LLM_ENABLED:
391
+ method += " + LLM synthesis (OpenRouter)"
392
+ if use_external_llm:
393
+ method += " + extra query generation"
394
+
395
+ lines = [
396
+ f"# Competitive Intelligence: {self.company}",
397
+ "",
398
+ f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')} ",
399
+ f"**Method:** {method} ",
400
+ f"**Searches:** {sum(len(s.queries_executed) for s in self.sections.values())} ",
401
+ f"**Sources:** {sum(len(s.sources) for s in self.sections.values())} unique URLs",
402
+ "",
403
+ "> **Legend:** [CONFIRMED] = from primary/verified source | [SPECULATIVE] = inferred or unverified",
404
+ "",
405
+ "---",
406
+ ]
407
+
408
+ for section in self.sections.values():
409
+ lines.append("")
410
+ lines.append(f"## {section.label}")
411
+ lines.append("")
412
+
413
+ if not section.findings:
414
+ lines.append("*No findings. Try broader queries or `--external-llm` for additional analysis.*")
415
+ lines.append("")
416
+ continue
417
+
418
+ for f in section.findings:
419
+ if isinstance(f, dict):
420
+ tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
421
+ text = f.get("text", "")
422
+ source = f.get("source", "")
423
+ lines.append(f"- **{tag}** {text}")
424
+ if source:
425
+ lines.append(f" - Source: {source}")
426
+ else:
427
+ lines.append(f"- {f}")
428
+
429
+ if section.gaps:
430
+ lines.append("")
431
+ lines.append("**Knowledge Gaps:**")
432
+ for gap in section.gaps:
433
+ gap_text = gap.get("text", gap) if isinstance(gap, dict) else gap
434
+ lines.append(f"- [ ] {gap_text}")
435
+
436
+ lines.append("")
437
+
438
+ if section.sources:
439
+ lines.append(f"<details><summary>Sources ({len(section.sources)} URLs)</summary>")
440
+ lines.append("")
441
+ for url in section.sources[:10]:
442
+ lines.append(f"- {url}")
443
+ if len(section.sources) > 10:
444
+ lines.append(f"- ... and {len(section.sources) - 10} more")
445
+ lines.append("")
446
+ lines.append("</details>")
447
+ lines.append("")
448
+
449
+ lines.extend(["---", "", "## Summary", ""])
450
+
451
+ total_findings = sum(len(s.findings) for s in self.sections.values())
452
+ confirmed = sum(
453
+ sum(1 for f in s.findings if isinstance(f, dict) and f.get("confirmed"))
454
+ for s in self.sections.values()
455
+ )
456
+ speculative = total_findings - confirmed
457
+
458
+ lines.append(f"| Metric | Count |")
459
+ lines.append(f"|--------|-------|")
460
+ lines.append(f"| Total findings | {total_findings} |")
461
+ lines.append(f"| Confirmed | {confirmed} |")
462
+ lines.append(f"| Speculative | {speculative} |")
463
+ lines.append(f"| Categories | {len(self.sections)} |")
464
+ lines.append("")
465
+
466
+ all_gaps = []
467
+ for s in self.sections.values():
468
+ for g in s.gaps:
469
+ gap_text = g.get("text", g) if isinstance(g, dict) else g
470
+ all_gaps.append(f"{s.label}: {gap_text}")
471
+
472
+ if all_gaps:
473
+ lines.append("### Outstanding Gaps")
474
+ lines.append("")
475
+ for gap in all_gaps:
476
+ lines.append(f"- [ ] {gap}")
477
+ lines.append("")
478
+
479
+ with open(report_path, "w") as f:
480
+ f.write("\n".join(lines))
481
+
482
+ return report_path
483
+
484
+ def _save_data(self):
485
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
486
+ slug = self.company.lower().replace(" ", "-").replace("/", "-")
487
+ slug = "".join(c for c in slug if c.isalnum() or c == "-")
488
+
489
+ data = {
490
+ "company": self.company,
491
+ "generated": datetime.now().isoformat(),
492
+ "sections": {},
493
+ }
494
+
495
+ for cat_key, section in self.sections.items():
496
+ data["sections"][cat_key] = {
497
+ "label": section.label,
498
+ "queries_executed": section.queries_executed,
499
+ "finding_count": len(section.findings),
500
+ "findings": section.findings,
501
+ "gaps": section.gaps,
502
+ "source_count": len(section.sources),
503
+ "sources": section.sources[:20],
504
+ }
505
+
506
+ json_path = self.output_dir / f"{timestamp}_{slug}_intel.json"
507
+ with open(json_path, "w") as f:
508
+ json.dump(data, f, indent=2)
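
End to end, the agent is driven as the class docstring suggests; a slightly fuller sketch (company, categories, and flag values are illustrative):

```python
from intel import CompetitorIntelAgent

agent = CompetitorIntelAgent("Ekso Bionics")

# Limit to two categories; LLM extras only apply when OPENROUTER_API_KEY is set.
report_path = agent.run(
    categories=["regulatory_clinical", "vulnerabilities_threats"],
    use_external_llm=True,
    delay=1.5,        # seconds between searches (rate limiting)
    max_results=8,
)
print("Markdown report:", report_path)  # a matching *_intel.json is saved alongside
```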
llm.py ADDED
@@ -0,0 +1,154 @@
1
+ """
2
+ OpenRouter LLM client for research analysis and synthesis.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ import re
9
+ import sys
10
+ from typing import Optional
11
+
12
+ try:
13
+ from .config import OPENROUTER_API_KEY, LLM_MODEL
14
+ except ImportError:
15
+ from config import OPENROUTER_API_KEY, LLM_MODEL
16
+
17
+
18
+ class LLMClient:
19
+ """OpenRouter LLM client for research analysis."""
20
+
21
+ def __init__(self, api_key: str = None, model: str = LLM_MODEL):
22
+ self.api_key = api_key or OPENROUTER_API_KEY
23
+ self.model = model
24
+ self.enabled = bool(self.api_key)
25
+
26
+ def call(self, prompt: str, system: str = None, max_tokens: int = 1000) -> Optional[str]:
27
+ if not self.enabled:
28
+ return None
29
+
30
+ import requests
31
+
32
+ messages = []
33
+ if system:
34
+ messages.append({"role": "system", "content": system})
35
+ messages.append({"role": "user", "content": prompt})
36
+
37
+ try:
38
+ response = requests.post(
39
+ "https://openrouter.ai/api/v1/chat/completions",
40
+ headers={
41
+ "Authorization": f"Bearer {self.api_key}",
42
+ "Content-Type": "application/json",
43
+ },
44
+ json={
45
+ "model": self.model,
46
+ "messages": messages,
47
+ "max_tokens": max_tokens,
48
+ "temperature": 0.3,
49
+ },
50
+ timeout=60
51
+ )
52
+ response.raise_for_status()
53
+ return response.json()["choices"][0]["message"]["content"]
54
+ except Exception as e:
55
+ print(f"[LLM ERROR] {e}", file=sys.stderr)
56
+ return None
57
+
58
+ def generate_category_queries(self, company: str, category_label: str) -> list[str]:
59
+ """Generate additional search queries for a specific intel category."""
60
+ system = (
61
+ "You are a competitive intelligence analyst specializing in "
62
+ "rehabilitation robotics and medical devices. "
63
+ "Generate specific, targeted web search queries. "
64
+ "Return ONLY a JSON array of query strings. "
65
+ "Focus on recent sources (last 18 months). Prioritize primary sources."
66
+ )
67
+
68
+ prompt = f"""Company: {company}
69
+ Category: {category_label}
70
+
71
+ Generate 3-4 additional specific search queries for deep competitive intelligence on this company in this category.
72
+ Focus on primary sources: company blog, official announcements, SEC filings, patent databases, verified review sites, job postings.
73
+ Return as JSON array: ["query1", "query2", ...]"""
74
+
75
+ response = self.call(prompt, system)
76
+ if response:
77
+ match = re.search(r'\[.*\]', response, re.DOTALL)
78
+ if match:
79
+ try:
80
+ return json.loads(match.group())[:4]
81
+ except Exception:
82
+ pass
83
+ return []
84
+
85
+ def synthesize_intel(self, company: str, category_key: str,
86
+ category_label: str, results: list,
87
+ synthesis_questions: dict = None) -> dict:
88
+ """Synthesize search results into structured intelligence.
89
+
90
+ Uses per-category questions to produce distilled, actionable findings.
91
+ Returns dict with 'findings' and 'gaps'.
92
+ """
93
+ try:
94
+ from .intel import CATEGORY_SYNTHESIS_QUESTIONS
95
+ except ImportError:
96
+ from intel import CATEGORY_SYNTHESIS_QUESTIONS
97
+
98
+ questions = (synthesis_questions or CATEGORY_SYNTHESIS_QUESTIONS).get(category_key, [])
99
+ if not questions:
100
+ return {"findings": [], "gaps": []}
101
+
102
+ questions_text = "\n".join(f"- {q}" for q in questions)
103
+
104
+ results_text = "\n".join([
105
+ f"- [{r.source}] {r.title}\n {r.snippet[:300]}\n URL: {r.url}"
106
+ for r in results[:15]
107
+ ])
108
+
109
+ system = (
110
+ "You are a competitive intelligence analyst for GURMA.ai, "
111
+ "a Swiss AI company entering rehabilitation robotics with "
112
+ "15 years of patient outcome data from BAMA Teknoloji. "
113
+ "Synthesize search results into actionable intelligence. "
114
+ "Recent sources only (last 18 months). "
115
+ "Flag speculation vs confirmed facts. Include URLs."
116
+ )
117
+
118
+ prompt = f"""Conduct deep competitive intelligence on {company}.
119
+ Category: {category_label}
120
+
121
+ Answer these specific questions based on the search results:
122
+ {questions_text}
123
+
124
+ Search results:
125
+ {results_text}
126
+
127
+ Return JSON:
128
+ {{
129
+ "findings": [
130
+ {{"text": "synthesized answer to one of the questions", "confirmed": true, "source": "url"}},
131
+ {{"text": "inferred insight", "confirmed": false, "source": "url or empty"}}
132
+ ],
133
+ "gaps": [
134
+ {{"text": "question that could NOT be answered from search results"}}
135
+ ]
136
+ }}
137
+
138
+ Rules:
139
+ - confirmed=true ONLY for facts from primary sources (company website, SEC filings, press releases)
140
+ - confirmed=false for inferred or secondary-source information
141
+ - Each finding should directly answer one of the questions above
142
+ - Be specific and quantitative where possible
143
+ - If a question cannot be answered, add it to gaps
144
+ - Maximum 12 findings"""
145
+
146
+ response = self.call(prompt, system, max_tokens=2000)
147
+ if response:
148
+ match = re.search(r'\{.*\}', response, re.DOTALL)
149
+ if match:
150
+ try:
151
+ return json.loads(match.group())
152
+ except Exception:
153
+ pass
154
+ return {"findings": [], "gaps": []}
research.py CHANGED
@@ -1,1922 +1,70 @@
1
  #!/usr/bin/env python3
2
  """
3
- GURMA.ai Research Tool
4
-
5
- Unified research tool combining:
6
- - Multi-backend web search (DuckDuckGo, SerpAPI, Brave)
7
- - Result storage and retrieval
8
- - Batch research runs
9
- - Deep competitive intelligence with LLM synthesis (via OpenRouter)
10
-
11
- Usage:
12
- # Single search
13
- python research.py search "rehabilitation robotics market"
14
-
15
- # Batch research on all competitors
16
- python research.py batch
17
-
18
- # Deep competitive intelligence on a single company
19
- python research.py competitor "Ekso Bionics"
20
- python research.py competitor "Fourier Intelligence" --external-llm
21
- python research.py competitor "Cyberdyne" -c company_overview,product_deep_dive
22
- python research.py competitor --list-categories
23
-
24
- # Extract to competitors.json (for dashboard)
25
- python research.py extract
26
-
27
- # List saved results
28
- python research.py list
29
  """
30
 
31
- from __future__ import annotations
32
-
33
- import argparse
34
- import json
35
- import os
36
- import re
37
- import sys
38
- import time
39
- from abc import ABC, abstractmethod
40
- from dataclasses import dataclass, field, asdict
41
- from datetime import datetime, timedelta
42
- from pathlib import Path
43
- from typing import Optional, Protocol
44
-
45
- # ============================================================
46
- # Configuration
47
- # ============================================================
48
-
49
- # Detect environment: HF Space (Docker at /app) vs local development
50
- def _detect_project_root() -> Path:
51
- """Detect project root based on environment."""
52
- # HF Spaces: running from /app with research.py in root
53
- if os.getenv("HF_SPACE") or Path("/app/research.py").exists():
54
- return Path("/app")
55
- # Local: research.py is in src/utils/
56
- return Path(__file__).parent.parent.parent
57
-
58
- PROJECT_ROOT = _detect_project_root()
59
- IS_HF_SPACE = PROJECT_ROOT == Path("/app")
60
-
61
- # Load .env if present (local development)
62
- if not IS_HF_SPACE:
63
- try:
64
- from dotenv import load_dotenv
65
- load_dotenv(PROJECT_ROOT / ".env")
66
- except ImportError:
67
- pass
68
-
69
- # Directories - different structure for HF Space vs local
70
- if IS_HF_SPACE:
71
- RESEARCH_DIR = PROJECT_ROOT / "data"
72
- DATA_DIR = PROJECT_ROOT / "data"
73
- else:
74
- RESEARCH_DIR = PROJECT_ROOT / "data"
75
- DATA_DIR = PROJECT_ROOT / "src" / "dashboard"
76
-
77
- # Ensure directories exist
78
- RESEARCH_DIR.mkdir(parents=True, exist_ok=True)
79
- DATA_DIR.mkdir(parents=True, exist_ok=True)
80
-
81
- # API Keys
82
- SERPAPI_KEY = os.getenv("SERPAPI_KEY")
83
- BRAVE_API_KEY = os.getenv("BRAVE_API_KEY")
84
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
85
-
86
- # LLM Config
87
- LLM_MODEL = "deepseek/deepseek-chat"
88
- LLM_ENABLED = bool(OPENROUTER_API_KEY)
89
-
90
- # Known competitors for batch research
91
- COMPETITORS = [
92
- "Hocoma", "Ekso Bionics", "Lifeward ReWalk", "Fourier Intelligence",
93
- "Cyberdyne HAL", "Wandercraft", "Myomo", "Bionik",
94
- ]
95
-
96
- # Query templates for batch research
97
- BATCH_QUERY_TEMPLATES = [
98
- "{company} latest news 2025 2026",
99
- "{company} funding investors valuation",
100
- "{company} FDA approval regulatory",
101
- "{company} partnerships collaborations",
102
- "{company} AI machine learning technology",
103
- # Targeted regulatory sources
104
- "site:accessdata.fda.gov {company}", # FDA 510(k) clearances
105
- "site:clinicaltrials.gov {company} rehabilitation", # Clinical trials
106
- # Funding & corporate
107
- "site:crunchbase.com {company}", # Funding history
108
- "site:sec.gov {company} 10-K OR 8-K", # SEC filings (public companies)
109
- # Patents & innovation
110
- "site:patents.google.com {company} exoskeleton OR rehabilitation",
111
- ]
112
-
113
- MARKET_QUERIES = [
114
- "rehabilitation robotics market size 2026 forecast",
115
- "exoskeleton market growth AI integration",
116
- "rehabilitation robotics insurance reimbursement",
117
- "medical exoskeleton FDA approval 2025",
118
- "stroke rehabilitation AI technology",
119
- "spinal cord injury exoskeleton treatment",
120
- "rehabilitation robotics competitive landscape",
121
- # Industry publications
122
- "site:exoskeletonreport.com 2025 2026", # Industry news
123
- "site:medgadget.com exoskeleton rehabilitation", # Med-tech news
124
- # Regulatory landscape
125
- "site:fda.gov rehabilitation robotics guidance",
126
- "MDR medical device regulation exoskeleton CE mark 2025",
127
- # Academic/clinical
128
- "site:pubmed.ncbi.nlm.nih.gov rehabilitation robotics AI 2024 2025",
129
- # Insurance/reimbursement (key for Holland market)
130
- "exoskeleton insurance coverage CMS reimbursement code",
131
- "rehabilitation robotics HCPCS code billing",
132
- ]
133
-
134
- # Deep competitive intelligence query templates by category
135
- # Tailored for rehabilitation robotics / medical device companies (~25 queries)
136
- DEEP_INTEL_CATEGORIES = {
137
- "company_overview": {
138
- "label": "Company Overview",
139
- "queries": [
140
- "{company} founding history milestones",
141
- "{company} CEO leadership team background",
142
- "{company} funding rounds investors valuation",
143
- "{company} employee count headcount growth",
144
- ],
145
- },
146
- "product_technology": {
147
- "label": "Product & Technology",
148
- "queries": [
149
- "{company} exoskeleton rehabilitation robot product specifications",
150
- "{company} AI machine learning technology capabilities",
151
- "{company} new product launch release 2025 2026",
152
- "{company} patent filings exoskeleton rehabilitation innovation",
153
- "site:patents.google.com {company} exoskeleton OR rehabilitation",
154
- ],
155
- },
156
- "regulatory_clinical": {
157
- "label": "Regulatory & Clinical",
158
- "queries": [
159
- "site:accessdata.fda.gov {company}",
160
- "{company} FDA 510k clearance CE mark MDR approval",
161
- "site:clinicaltrials.gov {company} rehabilitation",
162
- "{company} clinical outcomes study peer-reviewed results",
163
- ],
164
- },
165
- "market_channels": {
166
- "label": "Market & Channels",
167
- "queries": [
168
- "{company} hospital clinic installations customer base",
169
- "{company} insurance reimbursement coverage CMS",
170
- "{company} partnerships distributors resellers",
171
- "{company} conference MEDICA ACRM CES 2025 2026",
172
- ],
173
- },
174
- "vulnerabilities_threats": {
175
- "label": "Vulnerabilities & Threats",
176
- "queries": [
177
- "{company} weaknesses problems criticism recall",
178
- "{company} layoffs restructuring financial difficulty",
179
- "{company} Glassdoor employee reviews satisfaction",
180
- "{company} rehabilitation robotics AI expansion strategy 2025 2026",
181
- "{company} acquisitions mergers market share growth",
182
- "site:sec.gov {company} 10-K OR 8-K",
183
- ],
184
- },
185
- }
186
-
187
- # Primary/authoritative source domains for confirmed vs speculative scoring
188
- PRIMARY_SOURCE_DOMAINS = {
189
- # Regulatory / Official
190
- "sec.gov", "fda.gov", "clinicaltrials.gov", "patents.google.com",
191
- "accessdata.fda.gov",
192
- # Financial / Business data
193
- "crunchbase.com", "tracxn.com", "pitchbook.com", "cbinsights.com",
194
- "bloomberg.com", "reuters.com", "wsj.com", "finance.yahoo.com",
195
- "wellfound.com",
196
- # Professional
197
- "linkedin.com", "glassdoor.com",
198
- # Review platforms
199
- "g2.com", "capterra.com", "trustpilot.com",
200
- # Industry-specific
201
- "therobotreport.com", "exoskeletonreport.com", "medgadget.com",
202
- }
203
-
204
- # Expected data points per category — used for automatic gap detection.
205
- # Each field maps to regex patterns that indicate coverage in result text.
206
- CATEGORY_EXPECTED = {
207
- "company_overview": {
208
- "founding_year": [r'(?:founded|established|incorporated|started)\s+(?:in\s+)?(\d{4})'],
209
- "leadership": [r'(?:CEO|Chief Executive|CTO|CFO|President|Founder|Chairman|COO)'],
210
- "funding": [r'\$[\d,.]+\s*(?:million|billion|M|B)', r'(?:series\s+[A-F]|seed|IPO|funding\s+round)'],
211
- "employees": [r'(\d[\d,]*)\s*(?:employees|staff|headcount|team\s+members|workers)'],
212
- },
213
- "product_technology": {
214
- "products": [r'(?:product|device|robot|exoskeleton|system)\s'],
215
- "technology": [r'(?:AI|machine\s+learning|deep\s+learning|sensor|actuator|algorithm|neural)'],
216
- "patents": [r'(?:patent|IP|intellectual\s+property|invention)'],
217
- "recent_launches": [r'(?:launch|release|announc|unveil|introduc)\w*\s+.{0,30}(?:2025|2026)'],
218
- },
219
- "regulatory_clinical": {
220
- "fda_clearance": [r'(?:510\(?k\)?|FDA.?clear|FDA.?approv|de\s*novo)'],
221
- "ce_mark": [r'(?:CE.?mark|MDR|EU.?approv|notified.?body)'],
222
- "clinical_trials": [r'(?:clinical.?trial|NCT\d|randomized|controlled.?study|peer.?review)'],
223
- "clinical_outcomes": [r'(?:outcome|efficacy|recovery.?rate|improvement|functional.?score)'],
224
- },
225
- "market_channels": {
226
- "installations": [r'(?:hospital|clinic|center|install|deploy|site)\s'],
227
- "reimbursement": [r'(?:reimburse|insurance|CMS|Medicare|Medicaid|HCPCS|coverage|payer)'],
228
- "partnerships": [r'(?:partner|alliance|collaborat|distribut|reseller|dealer)'],
229
- "events": [r'(?:conference|MEDICA|ACRM|CES|expo|trade\s+show|summit)'],
230
- },
231
- "vulnerabilities_threats": {
232
- "weaknesses": [r'(?:weakness|problem|challenge|struggle|fail|recall|warning)'],
233
- "financial_stress": [r'(?:layoff|restructur|loss|declining|debt|delist|penny.stock)'],
234
- "employee_sentiment": [r'(?:glassdoor|employee.?review|work.?culture|turnover)'],
235
- "expansion": [r'(?:expansion|new.?market|acqui|merger|market.?share|growth.?strategy)'],
236
- },
237
- }
238
-
239
- # Per-category synthesis questions — the LLM answers these from search results.
240
- # Tailored for rehabilitation robotics / medical device competitors.
241
- CATEGORY_SYNTHESIS_QUESTIONS = {
242
- "company_overview": [
243
- "Founding story and key milestones",
244
- "Leadership team (backgrounds, medical device experience)",
245
- "Funding history (rounds, investors, valuations)",
246
- "Employee count and growth trajectory",
247
- ],
248
- "product_technology": [
249
- "Product catalog (devices, indications, patient populations)",
250
- "AI / machine learning capabilities (data they train on, algorithms used)",
251
- "Recent product launches and roadmap clues (last 12 months)",
252
- "Patent portfolio and innovation direction",
253
- "How does their technology compare to GURMA.ai's outcome-data approach?",
254
- ],
255
- "regulatory_clinical": [
256
- "FDA clearances (510(k) numbers, De Novo, dates)",
257
- "CE mark / MDR status in Europe",
258
- "Active clinical trials (ClinicalTrials.gov entries, endpoints)",
259
- "Published clinical outcomes (peer-reviewed studies, recovery rates)",
260
- "Reimbursement status (CMS, Medicare, private payer coverage)",
261
- ],
262
- "market_channels": [
263
- "Hospital and clinic installations (how many sites, which countries)",
264
- "Insurance and reimbursement strategy (pricing, payer relationships)",
265
- "Distribution partnerships and reseller network",
266
- "Conference and KOL presence (MEDICA, ACRM, physician endorsements)",
267
- ],
268
- "vulnerabilities_threats": [
269
- "What are they bad at? (clinical limitations, missing indications)",
270
- "Financial health (SEC filings, cash burn, stock trajectory)",
271
- "Employee sentiment (Glassdoor, hiring patterns, layoffs)",
272
- "Growth strategy (acquisitions, new markets, AI investments)",
273
- "What could they do that would hurt GURMA.ai most?",
274
- "Early warning signals to monitor",
275
- ],
276
- }
277
-
278
-
279
- # ============================================================
280
- # Search Backends (Open/Closed Principle)
281
- # ============================================================
282
-
283
- class SearchResult(Protocol):
284
- """Protocol for search result."""
285
- title: str
286
- url: str
287
- snippet: str
288
- source: str
289
-
290
-
291
- @dataclass
292
- class WebSearchResult:
293
- """Standard search result."""
294
- title: str
295
- url: str
296
- snippet: str
297
- source: str
298
-
299
-
300
- class SearchBackend(ABC):
301
- """Abstract base for search backends (Liskov Substitution)."""
302
-
303
- @property
304
- @abstractmethod
305
- def name(self) -> str:
306
- """Backend identifier."""
307
- pass
308
-
309
- @abstractmethod
310
- def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
311
- """Execute search and return results."""
312
- pass
313
-
314
- @abstractmethod
315
- def is_available(self) -> bool:
316
- """Check if backend is available (dependencies, API keys)."""
317
- pass
318
-
319
-
320
- class DuckDuckGoBackend(SearchBackend):
321
- """DuckDuckGo search (no API key required)."""
322
-
323
- @property
324
- def name(self) -> str:
325
- return "duckduckgo"
326
-
327
- def is_available(self) -> bool:
328
- try:
329
- from ddgs import DDGS
330
- return True
331
- except ImportError:
332
- try:
333
- from duckduckgo_search import DDGS
334
- return True
335
- except ImportError:
336
- return False
337
-
338
- def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
339
- try:
340
- from ddgs import DDGS
341
- except ImportError:
342
- from duckduckgo_search import DDGS
343
-
344
- results = []
345
- ddgs = DDGS()
346
- for r in ddgs.text(query, max_results=max_results):
347
- results.append(WebSearchResult(
348
- title=r.get("title", ""),
349
- url=r.get("href", r.get("link", "")),
350
- snippet=r.get("body", r.get("snippet", "")),
351
- source=self.name
352
- ))
353
- return results
354
-
355
-
356
- class SerpAPIBackend(SearchBackend):
357
- """SerpAPI search (requires API key)."""
358
-
359
- @property
360
- def name(self) -> str:
361
- return "serpapi"
362
-
363
- def is_available(self) -> bool:
364
- try:
365
- import requests
366
- return bool(SERPAPI_KEY)
367
- except ImportError:
368
- return False
369
-
370
- def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
371
- import requests
372
-
373
- response = requests.get(
374
- "https://serpapi.com/search",
375
- params={"q": query, "api_key": SERPAPI_KEY, "engine": "google", "num": max_results},
376
- timeout=30
377
- )
378
- response.raise_for_status()
379
- data = response.json()
380
-
381
- results = []
382
- for r in data.get("organic_results", [])[:max_results]:
383
- results.append(WebSearchResult(
384
- title=r.get("title", ""),
385
- url=r.get("link", ""),
386
- snippet=r.get("snippet", ""),
387
- source=self.name
388
- ))
389
- return results
390
-
391
-
392
- class BraveBackend(SearchBackend):
393
- """Brave search (requires API key)."""
394
-
395
- @property
396
- def name(self) -> str:
397
- return "brave"
398
-
399
- def is_available(self) -> bool:
400
- try:
401
- import requests
402
- return bool(BRAVE_API_KEY)
403
- except ImportError:
404
- return False
405
-
406
- def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
407
- import requests
408
-
409
- response = requests.get(
410
- "https://api.search.brave.com/res/v1/web/search",
411
- headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY},
412
- params={"q": query, "count": min(max_results, 20)},
413
- timeout=30
414
- )
415
- response.raise_for_status()
416
- data = response.json()
417
-
418
- results = []
419
- for r in data.get("web", {}).get("results", [])[:max_results]:
420
- results.append(WebSearchResult(
421
- title=r.get("title", ""),
422
- url=r.get("url", ""),
423
- snippet=r.get("description", ""),
424
- source=self.name
425
- ))
426
- return results
427
-
428
-
429
- # Backend registry
430
- BACKENDS: dict[str, SearchBackend] = {
431
- "duckduckgo": DuckDuckGoBackend(),
432
- "ddg": DuckDuckGoBackend(),
433
- "serpapi": SerpAPIBackend(),
434
- "brave": BraveBackend(),
435
- }
436
-
437
-
438
- def get_backend(name: str = "duckduckgo") -> SearchBackend:
439
- """Get search backend by name."""
440
- backend = BACKENDS.get(name)
441
- if not backend:
442
- raise ValueError(f"Unknown backend: {name}. Available: {list(BACKENDS.keys())}")
443
- if not backend.is_available():
444
- raise RuntimeError(f"Backend '{name}' not available. Check dependencies/API keys.")
445
- return backend
446
-
447
-
448
- # ============================================================
449
- # Result Storage (Single Responsibility)
450
- # ============================================================
451
-
452
- class ResultStorage:
453
- """Handles saving and loading search results."""
454
-
455
- def __init__(self, directory: Path = RESEARCH_DIR):
456
- self.directory = directory
457
- self.directory.mkdir(parents=True, exist_ok=True)
458
-
459
- def save(self, query: str, results: list[WebSearchResult], backend: str) -> tuple[Path, Path]:
460
- """Save results in JSON and Markdown formats. Returns (json_path, md_path)."""
461
- timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
462
- slug = self._slugify(query)
463
- base_name = f"{timestamp}_{slug}"
464
-
465
- # Build data
466
- data = {
467
- "query": query,
468
- "timestamp": datetime.now().isoformat(),
469
- "backend": backend,
470
- "result_count": len(results),
471
- "results": [asdict(r) for r in results]
472
- }
473
-
474
- # Save JSON
475
- json_path = self.directory / f"{base_name}.json"
476
- with open(json_path, "w") as f:
477
- json.dump(data, f, indent=2)
478
-
479
- # Save Markdown
480
- md_path = self.directory / f"{base_name}.md"
481
- with open(md_path, "w") as f:
482
- f.write(f"# Search: {query}\n\n")
483
- f.write(f"**Date:** {data['timestamp']} \n")
484
- f.write(f"**Backend:** {backend} \n")
485
- f.write(f"**Results:** {len(results)}\n\n---\n")
486
- for i, r in enumerate(results, 1):
487
- f.write(f"\n## {i}. {r.title}\n\n**URL:** {r.url}\n\n{r.snippet}\n")
488
-
489
- return json_path, md_path
490
-
491
- def list_searches(self, limit: int = 20) -> list[dict]:
492
- """List recent saved searches."""
493
- searches = []
494
- for json_file in sorted(self.directory.glob("*.json"), reverse=True):
495
- if json_file.name.startswith("."):
496
- continue
497
- try:
498
- with open(json_file) as f:
499
- data = json.load(f)
500
- searches.append({
501
- "file": json_file.name,
502
- "query": data.get("query", ""),
503
- "timestamp": data.get("timestamp", ""),
504
- "results": data.get("result_count", 0)
505
- })
506
- except Exception:
507
- pass
508
- if len(searches) >= limit:
509
- break
510
- return searches
511
-
512
- def get_recent_queries(self, days: int = 7) -> set[str]:
513
- """Get queries executed within the last N days (normalized for deduplication)."""
514
- cutoff = datetime.now() - timedelta(days=days)
515
- recent = set()
516
-
517
- for json_file in self.directory.glob("*.json"):
518
- if json_file.name.startswith("."):
519
- continue
520
- try:
521
- with open(json_file) as f:
522
- data = json.load(f)
523
- ts = data.get("timestamp", "")
524
- if ts:
525
- file_date = datetime.fromisoformat(ts.replace("Z", "+00:00").split("+")[0])
526
- if file_date >= cutoff:
527
- query = data.get("query", "").lower().strip()
528
- recent.add(query)
529
- except Exception:
530
- pass
531
- return recent
532
-
533
- def _slugify(self, text: str, max_len: int = 50) -> str:
534
- """Convert text to filesystem-safe slug."""
535
- slug = text.lower()[:max_len].replace(" ", "-").replace("/", "-")
536
- return "".join(c for c in slug if c.isalnum() or c == "-")
537
-
538
-
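A quick sketch of the storage round trip; the directory and query are illustrative:

from pathlib import Path
from research import ResultStorage, WebSearchResult

storage = ResultStorage(Path("data/research"))   # directory is created if missing
hits = [WebSearchResult(title="Example", url="https://example.com",
                        snippet="...", source="duckduckgo")]
json_path, md_path = storage.save("example query", hits, backend="duckduckgo")
print(json_path.name)                  # e.g. 20260218-141500_example-query.json
print(storage.list_searches(limit=5))  # newest first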
539
- # ============================================================
540
- # Search Service (Facade Pattern)
541
- # ============================================================
542
-
543
- class SearchService:
544
- """High-level search interface combining backend and storage."""
545
-
546
- def __init__(self, backend: str = "duckduckgo", storage: ResultStorage = None):
547
- self.backend = get_backend(backend)
548
- self.storage = storage or ResultStorage()
549
-
550
- def search(self, query: str, max_results: int = 10, save: bool = True) -> list[WebSearchResult]:
551
- """Execute search, optionally save results."""
552
- results = self.backend.search(query, max_results)
553
- if save and results:
554
- self.storage.save(query, results, self.backend.name)
555
- return results
556
-
557
- def search_batch(self, queries: list[str], max_results: int = 10,
558
- delay: float = 0.5, callback=None) -> dict[str, int]:
559
- """Execute multiple searches with rate limiting.
560
-
561
- Returns dict of {query: result_count}.
562
- """
563
- stats = {}
564
- for i, query in enumerate(queries, 1):
565
- if callback:
566
- callback(i, len(queries), query)
567
- try:
568
- results = self.search(query, max_results, save=True)
569
- stats[query] = len(results)
570
- except Exception as e:
571
- stats[query] = -1 # Error indicator
572
- print(f"Error on '{query}': {e}", file=sys.stderr)
573
-
574
- if delay > 0 and i < len(queries):
575
- time.sleep(delay)
576
-
577
- return stats
578
-
579
-
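A hedged sketch of a small batch run through the facade (the queries are examples):

from research import SearchService

service = SearchService(backend="duckduckgo")
stats = service.search_batch(
    ["Ekso Bionics layoffs 2025", "Cyberdyne HAL pricing"],
    max_results=5,
    delay=1.0,                                    # rate limiting between queries
    callback=lambda i, n, q: print(f"[{i}/{n}] {q}"),
)
print(stats)   # {query: result_count, ...}; -1 marks a failed query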
580
- # ============================================================
581
- # LLM Integration (Dependency Inversion)
582
- # ============================================================
583
-
584
- class LLMClient:
585
- """OpenRouter LLM client for research analysis."""
586
-
587
- def __init__(self, api_key: str = None, model: str = LLM_MODEL):
588
- self.api_key = api_key or OPENROUTER_API_KEY
589
- self.model = model
590
- self.enabled = bool(self.api_key)
591
-
592
- def call(self, prompt: str, system: str = None, max_tokens: int = 1000) -> Optional[str]:
593
- """Make LLM API call. Returns response text or None."""
594
- if not self.enabled:
595
- return None
596
-
597
- import requests
598
-
599
- messages = []
600
- if system:
601
- messages.append({"role": "system", "content": system})
602
- messages.append({"role": "user", "content": prompt})
603
-
604
- try:
605
- response = requests.post(
606
- "https://openrouter.ai/api/v1/chat/completions",
607
- headers={
608
- "Authorization": f"Bearer {self.api_key}",
609
- "Content-Type": "application/json",
610
- },
611
- json={
612
- "model": self.model,
613
- "messages": messages,
614
- "max_tokens": max_tokens,
615
- "temperature": 0.3,
616
- },
617
- timeout=60
618
- )
619
- response.raise_for_status()
620
- return response.json()["choices"][0]["message"]["content"]
621
- except Exception as e:
622
- print(f"[LLM ERROR] {e}", file=sys.stderr)
623
- return None
624
-
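A minimal call sketch; without OPENROUTER_API_KEY the client is disabled and call() returns None:

from research import LLMClient

llm = LLMClient()   # reads OPENROUTER_API_KEY and LLM_MODEL via config
answer = llm.call(
    "Summarize the rehabilitation robotics market in one sentence.",
    system="You are a terse market analyst.",
    max_tokens=100,
)
print(answer or "LLM disabled: no API key")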
625
- def generate_category_queries(self, company: str, category_label: str) -> list[str]:
626
- """Generate additional search queries for a specific intel category."""
627
- system = (
628
- "You are a competitive intelligence analyst specializing in "
629
- "rehabilitation robotics and medical devices. "
630
- "Generate specific, targeted web search queries. "
631
- "Return ONLY a JSON array of query strings. "
632
- "Focus on recent sources (last 18 months). Prioritize primary sources."
633
- )
634
-
635
- prompt = f"""Company: {company}
636
- Category: {category_label}
637
-
638
- Generate 3-4 additional specific search queries for deep competitive intelligence on this company in this category.
639
- Focus on primary sources: company blog, official announcements, SEC filings, patent databases, verified review sites, job postings.
640
- Return as JSON array: ["query1", "query2", ...]"""
641
-
642
- response = self.call(prompt, system)
643
- if response:
644
- match = re.search(r'\[.*\]', response, re.DOTALL)
645
- if match:
646
- try:
647
- return json.loads(match.group())[:4]
648
- except Exception:
649
- pass
650
- return []
651
-
652
- def synthesize_intel(self, company: str, category_key: str,
653
- category_label: str, results: list) -> dict:
654
- """Synthesize search results into structured intelligence.
655
-
656
- Uses per-category questions from CATEGORY_SYNTHESIS_QUESTIONS to
657
- produce distilled, actionable findings instead of raw snippets.
658
- Returns dict with 'findings' and 'gaps'.
659
- """
660
- questions = CATEGORY_SYNTHESIS_QUESTIONS.get(category_key, [])
661
- if not questions:
662
- return {"findings": [], "gaps": []}
663
-
664
- questions_text = "\n".join(f"- {q}" for q in questions)
665
-
666
- results_text = "\n".join([
667
- f"- [{r.source}] {r.title}\n {r.snippet[:300]}\n URL: {r.url}"
668
- for r in results[:15]
669
- ])
670
-
671
- system = (
672
- "You are a competitive intelligence analyst for GURMA.ai, "
673
- "a Swiss AI company entering rehabilitation robotics with "
674
- "15 years of patient outcome data from BAMA Teknoloji. "
675
- "Synthesize search results into actionable intelligence. "
676
- "Recent sources only (last 18 months). "
677
- "Flag speculation vs confirmed facts. Include URLs."
678
- )
679
-
680
- prompt = f"""Conduct deep competitive intelligence on {company}.
681
- Category: {category_label}
682
-
683
- Answer these specific questions based on the search results:
684
- {questions_text}
685
-
686
- Search results:
687
- {results_text}
688
-
689
- Return JSON:
690
- {{
691
- "findings": [
692
- {{"text": "synthesized answer to one of the questions", "confirmed": true, "source": "url"}},
693
- {{"text": "inferred insight", "confirmed": false, "source": "url or empty"}}
694
- ],
695
- "gaps": [
696
- {{"text": "question that could NOT be answered from search results"}}
697
- ]
698
- }}
699
-
700
- Rules:
701
- - confirmed=true ONLY for facts from primary sources (company website, SEC filings, press releases)
702
- - confirmed=false for inferred or secondary-source information
703
- - Each finding should directly answer one of the questions above
704
- - Be specific and quantitative where possible
705
- - If a question cannot be answered, add it to gaps
706
- - Maximum 12 findings"""
707
-
708
- response = self.call(prompt, system, max_tokens=2000)
709
- if response:
710
- match = re.search(r'\{.*\}', response, re.DOTALL)
711
- if match:
712
- try:
713
- return json.loads(match.group())
714
- except Exception:
715
- pass
716
- return {"findings": [], "gaps": []}
717
-
718
-
719
- # ============================================================
720
- # Deep Competitive Intelligence Agent
721
- # ============================================================
722
-
723
- @dataclass
724
- class IntelSection:
725
- """A section of the competitive intelligence report."""
726
- category: str
727
- label: str
728
- queries_executed: list = field(default_factory=list)
729
- results: list = field(default_factory=list)
730
- findings: list = field(default_factory=list)
731
- gaps: list = field(default_factory=list)
732
- sources: list = field(default_factory=list)
733
-
734
-
735
- class CompetitorIntelAgent:
736
- """Deep competitive intelligence agent for a single competitor.
737
-
738
- Runs structured research across 7 categories and produces
739
- a markdown + JSON report with [CONFIRMED]/[SPECULATIVE] tagging.
740
-
741
- Usage:
742
- agent = CompetitorIntelAgent("Ekso Bionics")
743
- report = agent.run() # built-in analysis
744
- report = agent.run(use_external_llm=True) # + OpenRouter
745
- """
746
-
747
- def __init__(self, company: str, search: SearchService = None, llm: LLMClient = None):
748
- self.company = company
749
- self.search = search or SearchService()
750
- self.llm = llm or LLMClient()
751
- self.sections: dict[str, IntelSection] = {}
752
- self.output_dir = RESEARCH_DIR / "intel"
753
- self.output_dir.mkdir(parents=True, exist_ok=True)
754
-
755
- def run(self, categories: list[str] = None, use_external_llm: bool = False,
756
- delay: float = 1.0, max_results: int = 10) -> Path:
757
- """Run deep competitive intelligence and generate report.
758
-
759
- Built-in analysis (source scoring, dedup, gap detection) always runs.
760
-
761
- Args:
762
- categories: Which categories to research (default: all 7)
763
- use_external_llm: Also use external LLM (OpenRouter) for enhanced analysis
764
- delay: Delay between searches in seconds (rate limiting)
765
- max_results: Max results per search query
766
-
767
- Returns: Path to generated markdown report
768
- """
769
- cats = categories or list(DEEP_INTEL_CATEGORIES.keys())
770
-
771
- total_queries = sum(
772
- len(DEEP_INTEL_CATEGORIES[c]["queries"])
773
- for c in cats if c in DEEP_INTEL_CATEGORIES
774
- )
775
-
776
- print(f"\n{'='*60}")
777
- print(f"Deep Competitive Intelligence: {self.company}")
778
- print(f"Categories: {len(cats)} | Queries: ~{total_queries}")
779
- print(f"Analysis: built-in{' + external LLM' if use_external_llm and self.llm.enabled else ''}")
780
- print(f"{'='*60}\n")
781
-
782
- for cat_key in cats:
783
- cat = DEEP_INTEL_CATEGORIES.get(cat_key)
784
- if not cat:
785
- print(f"[SKIP] Unknown category: {cat_key}")
786
- continue
787
-
788
- section = IntelSection(category=cat_key, label=cat["label"])
789
- self._research_category(section, cat, use_external_llm, delay, max_results)
790
- self.sections[cat_key] = section
791
-
792
- report_path = self._generate_report(use_external_llm)
793
- self._save_data()
794
-
795
- print(f"\n{'='*60}")
796
- print(f"Report: {report_path}")
797
- total_findings = sum(len(s.findings) for s in self.sections.values())
798
- total_gaps = sum(len(s.gaps) for s in self.sections.values())
799
- print(f"Findings: {total_findings} | Gaps: {total_gaps}")
800
- print(f"{'='*60}\n")
801
-
802
- return report_path
803
-
804
- def _research_category(self, section: IntelSection, cat: dict,
805
- use_external_llm: bool, delay: float, max_results: int):
806
- """Research a single category: generate queries, search, analyze.
807
-
808
- Built-in analysis (source scoring, dedup, gap detection) always runs.
809
- External LLM (OpenRouter) is an optional enhancement on top.
810
- """
811
- print(f"\n--- {section.label} ---")
812
-
813
- queries = [q.format(company=self.company) for q in cat["queries"]]
814
-
815
- # External LLM can generate additional targeted queries
816
- if use_external_llm and self.llm.enabled:
817
- extra = self.llm.generate_category_queries(self.company, section.label)
818
- if extra:
819
- queries.extend(extra)
820
- print(f" [EXTERNAL LLM] +{len(extra)} additional queries")
821
-
822
- for query in queries:
823
- print(f" [SEARCH] {query}")
824
- try:
825
- results = self.search.search(query, max_results=max_results, save=True)
826
- section.queries_executed.append(query)
827
- section.results.extend(results)
828
- for r in results:
829
- if r.url and r.url not in section.sources:
830
- section.sources.append(r.url)
831
- print(f" -> {len(results)} results")
832
- except Exception as e:
833
- print(f" -> Error: {e}")
834
-
835
- if delay > 0:
836
- time.sleep(delay)
837
-
838
- # Always run built-in analysis (no external API needed)
839
- section.findings = self._analyze_section(section)
840
- section.gaps = self._detect_gaps(section)
841
-
842
- confirmed = sum(1 for f in section.findings if f.get("confirmed"))
843
- speculative = len(section.findings) - confirmed
844
- print(f" [ANALYSIS] {len(section.findings)} findings ({confirmed} confirmed, {speculative} speculative)")
845
- if section.gaps:
846
- print(f" [GAPS] {len(section.gaps)}: {', '.join(g['text'] for g in section.gaps[:3])}")
847
-
848
- # LLM synthesis — automatic when OpenRouter is available
849
- if self.llm.enabled and section.results:
850
- print(f" [SYNTHESIS] Synthesizing {section.label}...")
851
- synthesis = self.llm.synthesize_intel(
852
- self.company, section.category, section.label, section.results
853
- )
854
- synth_findings = synthesis.get("findings", [])
855
- synth_gaps = synthesis.get("gaps", [])
856
-
857
- if synth_findings:
858
- # Synthesized findings are distilled answers — use them as primary.
859
- # Append any built-in findings from sources the LLM missed.
860
- synth_sources = {f.get("source", "") for f in synth_findings if f.get("source")}
861
- for bf in section.findings:
862
- if bf.get("source") and bf["source"] not in synth_sources:
863
- synth_findings.append(bf)
864
- section.findings = synth_findings
865
-
866
- for f in synth_findings:
867
- if isinstance(f, dict):
868
- tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
869
- print(f" {tag} {f.get('text', '')[:80]}")
870
-
871
- # Merge gaps from synthesis with built-in gaps
872
- existing_gaps = {g["text"].lower() for g in section.gaps}
873
- for sg in synth_gaps:
874
- gap_text = sg.get("text", sg) if isinstance(sg, dict) else sg
875
- if gap_text.lower() not in existing_gaps:
876
- section.gaps.append({"text": gap_text})
877
-
878
- def _analyze_section(self, section: IntelSection) -> list[dict]:
879
- """Built-in smart analysis: source scoring, dedup, structured extraction.
880
-
881
- This runs without any external LLM. It:
882
- 1. Filters results to those mentioning the company
883
- 2. Scores each source as confirmed (primary) or speculative (secondary)
884
- 3. Deduplicates by title similarity
885
- 4. Returns structured findings capped at 15 per section
886
- """
887
- findings = []
888
- seen_keys = set()
889
- aliases = self._get_aliases()
890
-
891
- for r in section.results:
892
- text_lower = f"{r.title} {r.snippet}".lower()
893
-
894
- # Only include results that mention the company
895
- if not any(alias in text_lower for alias in aliases):
896
- continue
897
-
898
- # Deduplicate by normalized title prefix
899
- dedup_key = re.sub(r'[^a-z0-9]', '', r.title.lower()[:50])
900
- if dedup_key in seen_keys:
901
- continue
902
- seen_keys.add(dedup_key)
903
-
904
- # Score source quality
905
- confirmed = self._is_primary_source(r.url)
906
-
907
- # Clean finding text
908
- title = r.title.strip()
909
- snippet = r.snippet.strip()[:250]
910
- finding_text = f"{title}: {snippet}" if snippet else title
911
-
912
- findings.append({
913
- "text": finding_text,
914
- "source": r.url,
915
- "confirmed": confirmed,
916
- })
917
-
918
- # Sort: confirmed first, then by text length (richer content first)
919
- findings.sort(key=lambda f: (not f["confirmed"], -len(f["text"])))
920
- return findings[:15]
921
-
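The dedup key is the first 50 characters of the title, lowercased, with non-alphanumerics stripped, so near-identical headlines collapse into one finding:

import re

def dedup_key(title: str) -> str:
    # same normalization as _analyze_section above
    return re.sub(r'[^a-z0-9]', '', title.lower()[:50])

assert dedup_key("Ekso Bionics: Q3 results!") == dedup_key("EKSO BIONICS -- Q3 Results")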
922
- def _is_primary_source(self, url: str) -> bool:
923
- """Score whether a URL is a primary/authoritative source.
924
-
925
- Primary = company's own site, regulatory filings, financial databases,
926
- established industry publications, review platforms.
927
- """
928
- if not url:
929
- return False
930
- url_lower = url.lower()
931
-
932
- # Check known primary domains
933
- for domain in PRIMARY_SOURCE_DOMAINS:
934
- if domain in url_lower:
935
- return True
936
-
937
- # Check if it's the company's own domain
938
- for alias in self._get_aliases():
939
- # Normalize: "ekso bionics" -> "eksobionics"
940
- slug = alias.replace(" ", "")
941
- host = url_lower.split("/")[2] if len(url_lower.split("/")) > 2 else ""
942
- if len(slug) >= 4 and slug in host:
- return True
943
-
944
- return False
945
-
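With the guard written out, the company-domain check reduces to: take the host segment of the URL and test whether a squashed alias of at least 4 characters appears in it:

url_lower = "https://eksobionics.com/investors".lower()
parts = url_lower.split("/")
host = parts[2] if len(parts) > 2 else ""
print("eksobionics" in host)   # True -> treated as a primary source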
946
- def _detect_gaps(self, section: IntelSection) -> list[dict]:
947
- """Detect missing data points for this category.
948
-
949
- Checks findings text against expected patterns per category.
950
- Returns list of gap dicts for fields with no matching data.
951
- """
952
- expected = CATEGORY_EXPECTED.get(section.category, {})
953
- if not expected:
954
- return []
955
-
956
- # Build text corpus from company-relevant results only
957
- aliases = self._get_aliases()
958
- relevant_text = " ".join(
959
- f"{r.title} {r.snippet}"
960
- for r in section.results
961
- if any(a in f"{r.title} {r.snippet}".lower() for a in aliases)
962
- )
963
-
964
- if not relevant_text:
965
- return [{"text": f"No relevant results found for {section.label}"}]
966
-
967
- relevant_lower = relevant_text.lower()
968
- gaps = []
969
- for field_name, patterns in expected.items():
970
- found = any(
971
- re.search(p, relevant_lower, re.IGNORECASE)
972
- for p in patterns
973
- )
974
- if not found:
975
- label = field_name.replace("_", " ").replace("/", " / ")
976
- gaps.append({"text": f"No data found for: {label}"})
977
-
978
- return gaps
979
-
980
- def _get_aliases(self) -> list[str]:
981
- """Get lowercase company aliases for text matching."""
982
- info = COMPANY_DEFINITIONS.get(self.company, {})
983
- aliases = info.get("aliases", [])
984
- if not aliases:
985
- aliases = [self.company.lower()]
986
- return aliases
987
-
988
- def _generate_report(self, use_external_llm: bool) -> Path:
989
- """Generate structured markdown report."""
990
- timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
991
- slug = self.company.lower().replace(" ", "-").replace("/", "-")
992
- slug = "".join(c for c in slug if c.isalnum() or c == "-")
993
-
994
- report_path = self.output_dir / f"{timestamp}_{slug}_intel.md"
995
-
996
- method = "Built-in analysis"
997
- if LLM_ENABLED:
998
- method += " + LLM synthesis (OpenRouter)"
999
- if use_external_llm:
1000
- method += " + extra query generation"
1001
-
1002
- lines = [
1003
- f"# Competitive Intelligence: {self.company}",
1004
- "",
1005
- f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')} ",
1006
- f"**Method:** {method} ",
1007
- f"**Searches:** {sum(len(s.queries_executed) for s in self.sections.values())} ",
1008
- f"**Sources:** {sum(len(s.sources) for s in self.sections.values())} unique URLs",
1009
- "",
1010
- "> **Legend:** [CONFIRMED] = from primary/verified source | [SPECULATIVE] = inferred or unverified",
1011
- "",
1012
- "---",
1013
- ]
1014
-
1015
- for section in self.sections.values():
1016
- lines.append("")
1017
- lines.append(f"## {section.label}")
1018
- lines.append("")
1019
-
1020
- if not section.findings:
1021
- lines.append("*No findings. Try broader queries or `--external-llm` for additional analysis.*")
1022
- lines.append("")
1023
- continue
1024
-
1025
- for f in section.findings:
1026
- if isinstance(f, dict):
1027
- tag = "[CONFIRMED]" if f.get("confirmed") else "[SPECULATIVE]"
1028
- text = f.get("text", "")
1029
- source = f.get("source", "")
1030
- lines.append(f"- **{tag}** {text}")
1031
- if source:
1032
- lines.append(f" - Source: {source}")
1033
- else:
1034
- lines.append(f"- {f}")
1035
-
1036
- if section.gaps:
1037
- lines.append("")
1038
- lines.append("**Knowledge Gaps:**")
1039
- for gap in section.gaps:
1040
- gap_text = gap.get("text", gap) if isinstance(gap, dict) else gap
1041
- lines.append(f"- [ ] {gap_text}")
1042
-
1043
- lines.append("")
1044
-
1045
- if section.sources:
1046
- lines.append(f"<details><summary>Sources ({len(section.sources)} URLs)</summary>")
1047
- lines.append("")
1048
- for url in section.sources[:10]:
1049
- lines.append(f"- {url}")
1050
- if len(section.sources) > 10:
1051
- lines.append(f"- ... and {len(section.sources) - 10} more")
1052
- lines.append("")
1053
- lines.append("</details>")
1054
- lines.append("")
1055
-
1056
- # Summary
1057
- lines.extend(["---", "", "## Summary", ""])
1058
-
1059
- total_findings = sum(len(s.findings) for s in self.sections.values())
1060
- confirmed = sum(
1061
- sum(1 for f in s.findings if isinstance(f, dict) and f.get("confirmed"))
1062
- for s in self.sections.values()
1063
- )
1064
- speculative = total_findings - confirmed
1065
-
1066
- lines.append(f"| Metric | Count |")
1067
- lines.append(f"|--------|-------|")
1068
- lines.append(f"| Total findings | {total_findings} |")
1069
- lines.append(f"| Confirmed | {confirmed} |")
1070
- lines.append(f"| Speculative | {speculative} |")
1071
- lines.append(f"| Categories | {len(self.sections)} |")
1072
- lines.append("")
1073
-
1074
- all_gaps = []
1075
- for s in self.sections.values():
1076
- for g in s.gaps:
1077
- gap_text = g.get("text", g) if isinstance(g, dict) else g
1078
- all_gaps.append(f"{s.label}: {gap_text}")
1079
-
1080
- if all_gaps:
1081
- lines.append("### Outstanding Gaps")
1082
- lines.append("")
1083
- for gap in all_gaps:
1084
- lines.append(f"- [ ] {gap}")
1085
- lines.append("")
1086
-
1087
- with open(report_path, "w") as f:
1088
- f.write("\n".join(lines))
1089
-
1090
- return report_path
1091
-
1092
- def _save_data(self):
1093
- """Save structured intel data as JSON alongside the report."""
1094
- timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
1095
- slug = self.company.lower().replace(" ", "-").replace("/", "-")
1096
- slug = "".join(c for c in slug if c.isalnum() or c == "-")
1097
-
1098
- data = {
1099
- "company": self.company,
1100
- "generated": datetime.now().isoformat(),
1101
- "sections": {},
1102
- }
1103
-
1104
- for cat_key, section in self.sections.items():
1105
- data["sections"][cat_key] = {
1106
- "label": section.label,
1107
- "queries_executed": section.queries_executed,
1108
- "finding_count": len(section.findings),
1109
- "findings": section.findings,
1110
- "gaps": section.gaps,
1111
- "source_count": len(section.sources),
1112
- "sources": section.sources[:20],
1113
- }
1114
-
1115
- json_path = self.output_dir / f"{timestamp}_{slug}_intel.json"
1116
- with open(json_path, "w") as f:
1117
- json.dump(data, f, indent=2)
1118
-
1119
-
1120
- # ============================================================
1121
- # Competitor Extraction (Data Processing)
1122
- # ============================================================
1123
-
1124
- # Company definitions for extraction
1125
- COMPANY_DEFINITIONS = {
1126
- "Hocoma": {"aliases": ["hocoma", "dih", "lokomat"], "country": "Switzerland", "product": "Lokomat", "status": "collapsed", "verified": True},
1127
- "Ekso Bionics": {"aliases": ["ekso", "eksobionics", "eksonr"], "country": "USA", "product": "EksoNR", "status": "weak", "verified": True},
1128
- "Cyberdyne": {"aliases": ["cyberdyne", "hal exoskeleton"], "country": "Japan", "product": "HAL", "status": "strong", "verified": True},
1129
- "Lifeward": {"aliases": ["lifeward", "rewalk", "alterg"], "country": "Israel/USA", "product": "ReWalk 7", "status": "consolidating", "verified": True},
1130
- "Fourier": {"aliases": ["fourier", "fourier intelligence"], "country": "China", "product": "X1, M2", "status": "growing", "verified": True},
1131
- "Myomo": {"aliases": ["myomo", "myopro"], "country": "USA", "product": "MyoPro", "status": "stable", "verified": False},
1132
- "Bionik": {"aliases": ["bionik", "inmotion"], "country": "Canada", "product": "InMotion", "status": "stable", "verified": False},
1133
- "Wandercraft": {"aliases": ["wandercraft", "atalante"], "country": "France", "product": "Atalante X", "status": "growing", "verified": False},
1134
- }
1135
-
1136
- # Status detection keywords (order matters)
1137
- STATUS_KEYWORDS = [
1138
- ("collapsed", ["bankrupt", "delisted", "suspended", "collapse", "shut down", "ceased", "nasdaq delisted"]),
1139
- ("weak", ["52-week low", "struggling", "losses", "declining", "layoffs"]),
1140
- ("growing", ["series e", "series d", "series c", "funding round", "$109 million"]),
1141
- ("consolidating", ["acquired", "merger", "acquisition"]),
1142
- ("strong", ["leader", "dominant", "profitable"]),
1143
- ]
1144
-
1145
- # Extraction patterns
1146
- DATE_PATTERN = re.compile(
1147
- r'((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})'
1148
- r'|(\d{4}-\d{2}-\d{2})'
1149
- r'|(\d{4}-\d{2})'
1150
- r'|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4})'
1151
- )
1152
- MONEY_PATTERN = re.compile(r'\$[\d,]+(?:\.\d+)?(?:\s*(?:million|billion|M|B))?|\d+(?:\.\d+)?\s*(?:million|billion)', re.IGNORECASE)
1153
-
1154
-
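Both patterns are deliberately permissive. Assuming the two compiled patterns above are in scope, a quick check of what they pick up:

text = "On March 5, 2024 the company raised $109 million."
print(DATE_PATTERN.search(text).group(0))   # "March 5, 2024"
print(MONEY_PATTERN.findall(text))          # ["$109 million"]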
1155
- class CompetitorExtractor:
1156
- """Extract structured competitor data from research results."""
1157
-
1158
- def __init__(self, research_dir: Path = RESEARCH_DIR,
1159
- output_file: Path = None):
1160
- self.research_dir = research_dir
1161
- self.output_file = output_file or (DATA_DIR / "competitors.json")
1162
-
1163
- def load_research_files(self) -> list[dict]:
1164
- """Load all JSON research files."""
1165
- results = []
1166
- if not self.research_dir.exists():
1167
- return results
1168
-
1169
- for json_file in self.research_dir.glob("*.json"):
1170
- if json_file.name.startswith("."):
1171
- continue
1172
- try:
1173
- with open(json_file) as f:
1174
- data = json.load(f)
1175
- data["_source_file"] = json_file.name
1176
- results.append(data)
1177
- except Exception as e:
1178
- print(f"Error loading {json_file}: {e}")
1179
-
1180
- return results
1181
-
1182
- def find_mentions(self, text: str) -> list[str]:
1183
- """Find which companies are mentioned in text."""
1184
- text_lower = text.lower()
1185
- mentioned = []
1186
- for company, info in COMPANY_DEFINITIONS.items():
1187
- if any(alias in text_lower for alias in info["aliases"]):
1188
- mentioned.append(company)
1189
- return mentioned
1190
-
1191
- def normalize_date(self, date_str: str) -> str | None:
1192
- """Normalize date string to YYYY-MM-DD. Returns None for bogus dates."""
1193
- formats = ["%B %d, %Y", "%B %d %Y", "%b %d, %Y", "%b %d %Y", "%Y-%m-%d", "%Y-%m"]
1194
- for fmt in formats:
1195
- try:
1196
- dt = datetime.strptime(date_str.strip(), fmt)
1197
- if dt.year < 2010:
1198
- return None
1199
- return dt.strftime("%Y-%m-%d")
1200
- except Exception:
1201
- pass
1202
- return date_str
1203
-
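Concretely (assuming CompetitorExtractor is importable via the shim):

from research import CompetitorExtractor

ex = CompetitorExtractor()
print(ex.normalize_date("March 5, 2024"))   # "2024-03-05"
print(ex.normalize_date("2024-03"))         # "2024-03-01" (month-only -> first day)
print(ex.normalize_date("June 1, 1999"))    # None (pre-2010 dates are rejected)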
1204
- def extract_events(self, text: str, company: str) -> list[dict]:
1205
- """Extract events (date + context) from text."""
1206
- events = []
1207
- aliases = COMPANY_DEFINITIONS[company]["aliases"]
1208
-
1209
- for match in DATE_PATTERN.finditer(text):
1210
- date_str = match.group(0)
1211
- if not date_str:
1212
- continue
1213
-
1214
- start = max(0, match.start() - 50)
1215
- end = min(len(text), match.end() + 150)
1216
- context = text[start:end]
1217
-
1218
- normalized = self.normalize_date(date_str)
1219
- if normalized and any(alias in context.lower() for alias in aliases):
1220
- events.append({
1221
- "date": normalized,
1222
- "context": context.strip()
1223
- })
1224
-
1225
- return events
1226
-
1227
- def detect_status(self, snippets: list[str], default: str) -> str:
1228
- """Detect status from snippets."""
1229
- text = " ".join(snippets).lower()
1230
- for status, keywords in STATUS_KEYWORDS:
1231
- if any(kw.lower() in text for kw in keywords):
1232
- return status
1233
- return default
1234
-
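Keyword order matters: "collapsed" is checked before "weak", so the strongest signal wins. For example:

from research import CompetitorExtractor

ex = CompetitorExtractor()
snippets = ["Nasdaq delisted the shares after the company ceased operations."]
print(ex.detect_status(snippets, default="stable"))   # "collapsed"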
1235
- def extract_stock(self, snippets: list[str]) -> Optional[float]:
1236
- """Extract stock price."""
1237
- for snippet in snippets:
1238
- match = re.search(r'\$(\d+\.?\d*)', snippet)
1239
- if match and float(match.group(1)) < 1000:
1240
- return float(match.group(1))
1241
- return None
1242
-
1243
- def extract_funding(self, money_mentions: list[str]) -> Optional[int]:
1244
- """Extract funding amount."""
1245
- for m in money_mentions:
1246
- match = re.search(r'(\d+)\s*(?:million|M)', m, re.IGNORECASE)
1247
- if match:
1248
- return int(match.group(1)) * 1_000_000
1249
- match = re.search(r'(\d+\.?\d*)\s*(?:billion|B)', m, re.IGNORECASE)
1250
- if match:
1251
- return int(float(match.group(1)) * 1_000_000_000)
1252
- return None
1253
-
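The unit conversion, sketched with illustrative mentions:

from research import CompetitorExtractor

ex = CompetitorExtractor()
print(ex.extract_funding(["raised $109 million"]))   # 109000000
print(ex.extract_funding(["a $1.2 billion round"]))  # 1200000000
print(ex.extract_funding(["no figures here"]))       # None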
1254
- def _load_intel_findings(self) -> dict[str, list[dict]]:
1255
- """Load confirmed findings from Deep Intel reports, grouped by company.
1256
-
1257
- Returns: {company_name: [{"text": ..., "confirmed": bool, "source": ...}, ...]}
1258
- Only loads the latest report per company.
1259
- """
1260
- intel_dir = self.research_dir / "intel"
1261
- if not intel_dir.exists():
1262
- return {}
1263
-
1264
- findings_by_company: dict[str, list[dict]] = {}
1265
- seen_companies: set[str] = set()
1266
-
1267
- for json_file in sorted(intel_dir.glob("*_intel.json"), reverse=True):
1268
- try:
1269
- with open(json_file) as f:
1270
- data = json.load(f)
1271
- company = data.get("company", "")
1272
- if not company or company in seen_companies:
1273
- continue
1274
- seen_companies.add(company)
1275
-
1276
- all_findings = []
1277
- for section in data.get("sections", {}).values():
1278
- for finding in section.get("findings", []):
1279
- if isinstance(finding, dict) and finding.get("text"):
1280
- all_findings.append(finding)
1281
- elif isinstance(finding, str) and finding:
1282
- all_findings.append({"text": finding, "confirmed": False, "source": ""})
1283
-
1284
- if all_findings:
1285
- findings_by_company[company] = all_findings
1286
- except Exception:
1287
- pass
1288
-
1289
- return findings_by_company
1290
-
1291
- def _extract_intel_opportunities(self, intel_findings: dict[str, list[dict]]) -> list[dict]:
1292
- """Extract opportunity signals from Deep Intel confirmed findings.
1293
-
1294
- Scans for: vulnerability keywords, financial weakness, market gaps,
1295
- technology gaps, regulatory issues, customer complaints.
1296
- """
1297
- opportunities = []
1298
-
1299
- vuln_patterns = [
1300
- (r'(?:layoff|restructur|downsiz|headcount.?reduc)', "workforce_cut", 2),
1301
- (r'(?:delist|stock.?(?:drop|fall|declin)|52.week.low|penny.stock)', "financial_distress", 1),
1302
- (r'(?:FDA.?(?:reject|warning|recall)|regulatory.?(?:issue|fail|delay))', "regulatory_issue", 2),
1303
- (r'(?:bankrupt|insolvenc|cease.?operat|wind.?down|liquidat)', "collapse", 1),
1304
- (r'(?:customer.?complain|negative.?review|churn|losing.?customer)', "customer_risk", 2),
1305
- (r'(?:legacy|technical.?debt|outdated|proprietary.?lock)', "tech_weakness", 3),
1306
- (r'(?:no.?AI|lack.?(?:of.?)?(?:data|machine.learn|personali))', "ai_gap", 2),
1307
- ]
1308
-
1309
- for company, findings in intel_findings.items():
1310
- confirmed = [f for f in findings if f.get("confirmed")]
1311
- all_text = " ".join(f["text"] for f in confirmed).lower() if confirmed else ""
1312
- all_text_full = " ".join(f["text"] for f in findings).lower()
1313
-
1314
- for pattern, opp_type, priority in vuln_patterns:
1315
- # Check confirmed findings first (higher weight)
1316
- if re.search(pattern, all_text, re.IGNORECASE):
1317
- match_finding = next(
1318
- (f for f in confirmed if re.search(pattern, f["text"], re.IGNORECASE)),
1319
- None
1320
- )
1321
- if match_finding:
1322
- opportunities.append({
1323
- "type": opp_type,
1324
- "text": f"{company}: {match_finding['text'][:120]}",
1325
- "priority": priority,
1326
- "confirmed": True,
1327
- "source": match_finding.get("source", ""),
1328
- "company": company,
1329
- })
1330
- # Then speculative
1331
- elif re.search(pattern, all_text_full, re.IGNORECASE):
1332
- match_finding = next(
1333
- (f for f in findings if re.search(pattern, f["text"], re.IGNORECASE)),
1334
- None
1335
- )
1336
- if match_finding:
1337
- opportunities.append({
1338
- "type": opp_type,
1339
- "text": f"{company}: {match_finding['text'][:120]}",
1340
- "priority": priority + 1,
1341
- "confirmed": False,
1342
- "source": match_finding.get("source", ""),
1343
- "company": company,
1344
- })
1345
-
1346
- return opportunities
1347
-
1348
- def _load_sota_tech_signals(self) -> list[dict]:
1349
- """Load tech advantage signals from SOTA knowledge base.
1350
-
1351
- Reads data/sota/knowledge_base.json, extracts high-priority
1352
- techniques and key principles as opportunity items.
1353
- """
1354
- kb_path = self.research_dir / "sota" / "knowledge_base.json"
1355
- if not kb_path.exists():
1356
- return []
1357
-
1358
- try:
1359
- with open(kb_path) as f:
1360
- kb = json.load(f)
1361
- except Exception:
1362
- return []
1363
-
1364
- signals = []
1365
-
1366
- # High-priority techniques → tech advantages (priority 2)
1367
- for t in kb.get("techniques", []):
1368
- if t.get("priority") == "high" and t.get("gurma_fit"):
1369
- signals.append({
1370
- "type": "tech_advantage",
1371
- "text": f"{t['name']}: {t['gurma_fit'][:120]}",
1372
- "priority": 2,
1373
- "confirmed": True,
1374
- "company": "GURMA",
1375
- })
1376
-
1377
- # Key principles → strategic validation (priority 3, capped at 2)
1378
- for p in kb.get("key_principles", [])[:2]:
1379
- if p.get("principle"):
1380
- signals.append({
1381
- "type": "tech_principle",
1382
- "text": f"{p['principle']}: {p.get('detail', '')[:100]}",
1383
- "priority": 3,
1384
- "confirmed": True,
1385
- "company": "GURMA",
1386
- })
1387
-
1388
- return signals
1389
-
1390
- def _opportunity_changed(self, new_opps: list[dict], existing: dict) -> bool:
1391
- """Decide whether new opportunity data is materially different from existing.
1392
-
1393
- Returns True if the widget deserves an update. Criteria:
1394
- - New opportunity types appeared (e.g. a collapse that wasn't there)
1395
- - Priority-1 signals changed
1396
- - >50% of top points are different
1397
- """
1398
- existing_points = set(existing.get("points", []))
1399
- new_points = set(o["text"] for o in new_opps[:4])
1400
-
1401
- if not existing_points:
1402
- return True
1403
-
1404
- # Check if any priority-1 signals are new
1405
- new_p1_types = {o["type"] for o in new_opps if o["priority"] == 1}
1406
- old_raw = existing.get("raw_opportunities", [])
1407
- old_p1_types = {o["type"] for o in old_raw if o.get("priority") == 1}
1408
- if new_p1_types != old_p1_types:
1409
- return True
1410
-
1411
- # Check overlap of top points — if less than half match, it's a meaningful change
1412
- overlap = existing_points & new_points
1413
- if len(overlap) < len(existing_points) / 2:
1414
- return True
1415
-
1416
- return False
1417
-
1418
- def _synthesize_opportunity_llm(self, opportunities: list[dict],
1419
- competitors: list[dict]) -> Optional[dict]:
1420
- """Use LLM to synthesize a strategic opportunity headline + points.
1421
-
1422
- Returns {headline, points} or None if LLM unavailable/fails.
1423
- """
1424
- if not LLM_ENABLED:
1425
- return None
1426
-
1427
- llm = LLMClient()
1428
-
1429
- opp_text = "\n".join(
1430
- f"- [{o['type']}] {'[CONFIRMED]' if o.get('confirmed') else '[SPECULATIVE]'} {o['text']}"
1431
- for o in opportunities[:12]
1432
- )
1433
-
1434
- comp_summary = "\n".join(
1435
- f"- {c['name']}: status={c['status']}, "
1436
- f"{'stock=$'+format(c['stock'], '.2f') if c.get('stock') else 'no stock data'}, "
1437
- f"{'funding=$'+format(c['funding']/1e6, '.0f')+'M' if c.get('funding') else 'no funding data'}"
1438
- for c in competitors[:8]
1439
- )
1440
-
1441
- system = (
1442
- "You are a strategic advisor for GURMA.ai, a Swiss AI company "
1443
- "entering rehabilitation robotics with 15 years of patient outcome "
1444
- "data (not just motion data) from BAMA Teknoloji. "
1445
- "You produce concise, actionable strategic assessments."
1446
- )
1447
-
1448
- prompt = f"""Based on the following competitive + technology signals and competitor data,
1449
- produce a strategic opportunity assessment for GURMA.ai.
1450
-
1451
- Signals (competitive, tech advantages, and threats):
1452
- {opp_text}
1453
-
1454
- Competitor landscape:
1455
- {comp_summary}
1456
-
1457
- Return JSON:
1458
- {{
1459
- "headline": "One punchy sentence (max 10 words) summarizing the #1 strategic opportunity",
1460
- "points": [
1461
- "Actionable insight 1 (max 20 words, include numbers where available)",
1462
- "Actionable insight 2",
1463
- "Actionable insight 3",
1464
- "Actionable insight 4"
1465
- ]
1466
- }}
1467
-
1468
- Rules:
1469
- - Headline should be about the OPPORTUNITY, not just a competitor's problem
1470
- - Points should mix competitive windows, tech advantages, AND threats
1471
- - Be specific: include dollar amounts, dates, competitor names, model/technique names
1472
- - Maximum 4 points, ranked by strategic importance
1473
- - confirmed signals should be weighted more heavily than speculative ones"""
1474
-
1475
- response = llm.call(prompt, system, max_tokens=500)
1476
- if response:
1477
- match = re.search(r'\{.*\}', response, re.DOTALL)
1478
- if match:
1479
- try:
1480
- result = json.loads(match.group())
1481
- if result.get("headline") and result.get("points"):
1482
- return result
1483
- except Exception:
1484
- pass
1485
- return None
1486
-
1487
- def detect_opportunities(self, competitors: list[dict], all_snippets: list[str]) -> dict:
1488
- """
1489
- Detect market opportunities from competitor data + Deep Intel findings.
1490
-
1491
- Combines: structured competitor status, Deep Intel confirmed findings,
1492
- and optionally LLM synthesis for headline/points.
1493
- """
1494
- opportunities = []
1495
-
1496
- # --- Source 1: Structured competitor status (always available) ---
1497
- collapsed = [c for c in competitors if c["status"] == "collapsed"]
1498
- weak = [c for c in competitors if c["status"] == "weak"]
1499
-
1500
- if collapsed:
1501
- names = ", ".join(c["name"] for c in collapsed)
1502
- opportunities.append({
1503
- "type": "market_gap",
1504
- "text": f"{names} collapsed — customers seeking alternatives",
1505
- "priority": 1, "confirmed": True, "company": names,
1506
- })
1507
-
1508
- if weak:
1509
- for c in weak:
1510
- opp_text = f"{c['name']} financially weak"
1511
- if c.get("stock"):
1512
- opp_text += f" (${c['stock']:.2f})"
1513
- opp_text += " — vulnerable to disruption"
1514
- opportunities.append({
1515
- "type": "weakness",
1516
- "text": opp_text,
1517
- "priority": 2, "confirmed": True, "company": c["name"],
1518
- })
1519
-
1520
- growing = [c for c in competitors if c["status"] == "growing" and c.get("funding")]
1521
- for c in growing:
1522
- funding_m = c["funding"] / 1_000_000
1523
- opportunities.append({
1524
- "type": "threat",
1525
- "text": f"{c['name']} well-funded (${funding_m:.0f}M) — monitor closely",
1526
- "priority": 3, "confirmed": True, "company": c["name"],
1527
- })
1528
-
1529
- # BAMA data advantage
1530
- if competitors:
1531
- opportunities.append({
1532
- "type": "advantage",
1533
- "text": "BAMA has 15 years outcome data vs. competitors' motion data",
1534
- "priority": 1, "confirmed": True, "company": "BAMA",
1535
- })
1536
-
1537
- # --- Source 2: Deep Intel findings (if available) ---
1538
- intel_findings = self._load_intel_findings()
1539
- if intel_findings:
1540
- intel_opps = self._extract_intel_opportunities(intel_findings)
1541
- existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
1542
- for io in intel_opps:
1543
- key = (io.get("company", ""), io["type"])
1544
- if key not in existing_keys:
1545
- opportunities.append(io)
1546
- existing_keys.add(key)
1547
-
1548
- # --- Source 3: SOTA KB tech signals (if available) ---
1549
- sota_signals = self._load_sota_tech_signals()
1550
- if sota_signals:
1551
- existing_keys = {(o.get("company", ""), o["type"]) for o in opportunities}
1552
- for ts in sota_signals:
1553
- key = (ts.get("company", ""), ts["type"])
1554
- if key not in existing_keys:
1555
- opportunities.append(ts)
1556
- existing_keys.add(key)
1557
-
1558
- # Sort by priority
1559
- opportunities.sort(key=lambda x: x["priority"])
1560
-
1561
- # --- Headline + points: LLM synthesis or rule-based fallback ---
1562
- llm_result = self._synthesize_opportunity_llm(opportunities, competitors)
1563
-
1564
- if llm_result:
1565
- headline = llm_result["headline"]
1566
- points = llm_result["points"][:4]
1567
- else:
1568
- if collapsed:
1569
- headline = f"{collapsed[0]['name']} collapse creates market window"
1570
- elif weak:
1571
- headline = "Competitor weakness creates opportunity"
1572
- else:
1573
- headline = "Data advantage positions GURMA.ai for growth"
1574
- points = [o["text"] for o in opportunities[:4]]
1575
-
1576
- # Build sources list
1577
- sources = ["competitor"]
1578
- if intel_findings:
1579
- sources.append("intel")
1580
- if sota_signals:
1581
- sources.append("tech")
1582
- if llm_result:
1583
- sources.append("llm")
1584
-
1585
- return {
1586
- "headline": headline,
1587
- "points": points,
1588
- "detected_at": datetime.now().strftime("%Y-%m-%d"),
1589
- "raw_opportunities": opportunities,
1590
- "sources": sources,
1591
- }
1592
-
1593
- def load_existing_data(self) -> Optional[dict]:
1594
- """Load existing competitors.json if it exists."""
1595
- if self.output_file.exists():
1596
- try:
1597
- with open(self.output_file) as f:
1598
- return json.load(f)
1599
- except Exception:
1600
- pass
1601
- return None
1602
-
1603
- def process(self) -> dict:
1604
- """Process research files and build competitors.json."""
1605
- research_data = self.load_research_files()
1606
- if not research_data:
1607
- return {"competitors": [], "market": {}}
1608
-
1609
- # Aggregate data per company
1610
- from collections import defaultdict
1611
- company_data = defaultdict(lambda: {
1612
- "mentions": 0, "snippets": [], "events": [], "money": [], "urls": []
1613
- })
1614
-
1615
- for research in research_data:
1616
- for result in research.get("results", []):
1617
- text = f"{result.get('title', '')} {result.get('snippet', '')}"
1618
- url = result.get("url", "")
1619
-
1620
- for company in self.find_mentions(text):
1621
- cd = company_data[company]
1622
- cd["mentions"] += 1
1623
- cd["snippets"].append(result.get("snippet", "")[:200])
1624
- cd["urls"].append(url)
1625
- cd["events"].extend(self.extract_events(text, company))
1626
- cd["money"].extend(MONEY_PATTERN.findall(text))
1627
-
1628
- # Build output
1629
- competitors = []
1630
- for company, info in COMPANY_DEFINITIONS.items():
1631
- data = company_data[company]
1632
-
1633
- status = info["status"] if info.get("verified") else self.detect_status(data["snippets"], info["status"])
1634
-
1635
- competitors.append({
1636
- "name": company,
1637
- "country": info["country"],
1638
- "product": info["product"],
1639
- "status": status,
1640
- "stock": self.extract_stock(data["snippets"]),
1641
- "funding": self.extract_funding(data["money"]),
1642
- "notes": data["snippets"][0] if data["snippets"] else "",
1643
- "mentions": data["mentions"],
1644
- "events": [{"date": e["date"], "event": e["context"][:100]} for e in data["events"][:10]],
1645
- "sample_urls": list(set(data["urls"]))[:5],
1646
- })
1647
-
1648
- competitors.sort(key=lambda x: x["mentions"], reverse=True)
1649
-
1650
- # Detect opportunities (from competitor status + Deep Intel findings)
1651
- all_snippets = []
1652
- for company, data in company_data.items():
1653
- all_snippets.extend(data["snippets"])
1654
- new_opportunity = self.detect_opportunities(competitors, all_snippets)
1655
-
1656
- # Decide whether to update the widget
1657
- existing = self.load_existing_data()
1658
- existing_opp = existing.get("opportunity", {}) if existing else {}
1659
-
1660
- if existing_opp.get("confirmed"):
1661
- # Confirmed: only flag update if data materially changed
1662
- if self._opportunity_changed(new_opportunity.get("raw_opportunities", []), existing_opp):
1663
- opportunity = existing_opp
1664
- opportunity["update_available"] = True
1665
- opportunity["suggested_update"] = new_opportunity
1666
- else:
1667
- opportunity = existing_opp
1668
- opportunity["update_available"] = False
1669
- else:
1670
- # Not confirmed: auto-update
1671
- opportunity = new_opportunity
1672
- opportunity["confirmed"] = False
1673
- opportunity["update_available"] = False
1674
-
1675
- return {
1676
- "competitors": competitors,
1677
- "market": {"size_2024": 2_000_000_000, "size_2029_ai": 9_100_000_000, "cagr": 0.278},
1678
- "opportunity": opportunity,
1679
- "_generated": datetime.now().isoformat(),
1680
- "_source_files": [f.name for f in self.research_dir.glob("*.json") if not f.name.startswith(".")]
1681
- }
1682
-
1683
- def save(self, data: dict = None) -> Path:
1684
- """Process and save to output file."""
1685
- data = data or self.process()
1686
- self.output_file.parent.mkdir(parents=True, exist_ok=True)
1687
- with open(self.output_file, "w") as f:
1688
- json.dump(data, f, indent=2)
1689
- return self.output_file
1690
-
1691
-
1692
- # ============================================================
1693
- # CLI Commands
1694
- # ============================================================
1695
-
1696
- def cmd_extract(args):
1697
- """Extract competitor data from research."""
1698
- extractor = CompetitorExtractor()
1699
-
1700
- print(f"Loading research from: {extractor.research_dir}")
1701
- data = extractor.process()
1702
-
1703
- if not data["competitors"]:
1704
- print("No research files found. Run 'batch' first.")
1705
- return
1706
-
1707
- output = extractor.save(data)
1708
-
1709
- print(f"Saved to: {output}")
1710
- print(f"\nCompany mentions:")
1711
- for comp in data["competitors"]:
1712
- status_marker = {"collapsed": "⚠", "weak": "↓", "growing": "↑", "strong": "★"}.get(comp["status"], "•")
1713
- print(f" {status_marker} {comp['name']}: {comp['mentions']} mentions ({comp['status']})")
1714
-
1715
-
1716
- def cmd_search(args):
1717
- """Single search command."""
1718
- service = SearchService(backend=args.backend)
1719
- print(f"Searching: {args.query}")
1720
- print(f"Backend: {args.backend} | Max: {args.max_results}")
1721
- print("-" * 50)
1722
-
1723
- results = service.search(args.query, args.max_results, save=args.save)
1724
-
1725
- for i, r in enumerate(results, 1):
1726
- print(f"\n{i}. {r.title}")
1727
- print(f" {r.url}")
1728
- print(f" {r.snippet[:150]}...")
1729
-
1730
- print(f"\n[{len(results)} results]")
1731
- if args.save:
1732
- print(f"Saved to: {RESEARCH_DIR}")
1733
-
1734
-
1735
- def cmd_batch(args):
1736
- """Batch research command."""
1737
- service = SearchService(backend=args.backend)
1738
- storage = ResultStorage()
1739
-
1740
- # Generate all queries
1741
- queries = []
1742
- for company in COMPETITORS:
1743
- for template in BATCH_QUERY_TEMPLATES:
1744
- queries.append(template.format(company=company))
1745
- queries.extend(MARKET_QUERIES)
1746
-
1747
- total_queries = len(queries)
1748
-
1749
- # Deduplicate unless --force is set
1750
- skipped = 0
1751
- if not args.force:
1752
- recent = storage.get_recent_queries(days=args.days)
1753
- original_count = len(queries)
1754
- queries = [q for q in queries if q.lower().strip() not in recent]
1755
- skipped = original_count - len(queries)
1756
-
1757
- print(f"Batch Research")
1758
- print(f"{'='*60}")
1759
- print(f"Competitors: {len(COMPETITORS)}")
1760
- print(f"Total queries: {total_queries}")
1761
- if skipped > 0:
1762
- print(f"Skipped (run in last {args.days} days): {skipped}")
1763
- print(f"New queries to run: {len(queries)}")
1764
- print(f"Output: {RESEARCH_DIR}")
1765
- print(f"{'='*60}")
1766
-
1767
- if not queries:
1768
- print("\nNo new queries to run. Use --force to re-run all.")
1769
- return
1770
-
1771
- def progress(i, total, query):
1772
- print(f"\n[{i}/{total}] {query}")
1773
-
1774
- stats = service.search_batch(queries, args.max_results, args.delay, callback=progress)
1775
-
1776
- success = sum(1 for v in stats.values() if v >= 0)
1777
- print(f"\n{'='*60}")
1778
- print(f"Complete: {success}/{len(queries)} successful")
1779
- if skipped > 0:
1780
- print(f"Skipped: {skipped} (already run recently)")
1781
- print(f"{'='*60}")
1782
-
1783
-
1784
- def cmd_competitor(args):
1785
- """Deep competitive intelligence on a single competitor."""
1786
- company = args.company
1787
- use_external_llm = args.external_llm
1788
-
1789
- if use_external_llm and not LLM_ENABLED:
1790
- print("Warning: --external-llm requested but OPENROUTER_API_KEY not found. Skipping external LLM.")
1791
- use_external_llm = False
1792
-
1793
- categories = None
1794
- if args.categories:
1795
- categories = [c.strip() for c in args.categories.split(",")]
1796
- valid = set(DEEP_INTEL_CATEGORIES.keys())
1797
- invalid = [c for c in categories if c not in valid]
1798
- if invalid:
1799
- print(f"Invalid categories: {invalid}")
1800
- print(f"Valid: {sorted(valid)}")
1801
- return
1802
-
1803
- if args.list_categories:
1804
- print("Available categories:")
1805
- for key, cat in DEEP_INTEL_CATEGORIES.items():
1806
- q_count = len(cat["queries"])
1807
- print(f" {key:30s} {cat['label']:30s} ({q_count} queries)")
1808
- return
1809
-
1810
- agent = CompetitorIntelAgent(company)
1811
- report_path = agent.run(
1812
- categories=categories,
1813
- use_external_llm=use_external_llm,
1814
- delay=args.delay,
1815
- max_results=args.max_results,
1816
  )
1817
-
1818
- print(f"\nReport: {report_path}")
1819
-
1820
-
1821
- def cmd_sota(args):
1822
- """SOTA technology knowledge base."""
1823
- try:
1824
- from .sota_agent import SOTAScoutAgent
1825
- except ImportError:
1826
- from sota_agent import SOTAScoutAgent
1827
-
1828
- agent = SOTAScoutAgent()
1829
-
1830
- if args.analyze:
1831
- report = agent.analyze(args.analyze)
1832
- print(f"\nAnalysis report: {report}")
1833
- return
1834
-
1835
- # Default: show knowledge base
1836
- agent.show(section=args.show)
1837
-
1838
-
1839
- def cmd_list(args):
1840
- """List saved searches."""
1841
- storage = ResultStorage()
1842
- searches = storage.list_searches(args.limit)
1843
-
1844
- if not searches:
1845
- print(f"No searches in {RESEARCH_DIR}")
1846
- return
1847
-
1848
- print(f"Recent searches ({RESEARCH_DIR}):\n")
1849
- for s in searches:
1850
- print(f" {s['timestamp'][:10]} {s['results']:2d} results {s['query'][:50]}")
1851
-
1852
-
1853
- def main():
1854
- parser = argparse.ArgumentParser(
1855
- description="GURMA.ai Research Tool",
1856
- formatter_class=argparse.RawDescriptionHelpFormatter
1857
  )
1858
- subparsers = parser.add_subparsers(dest="command", help="Commands")
1859
-
1860
- # search
1861
- p_search = subparsers.add_parser("search", help="Single web search")
1862
- p_search.add_argument("query", help="Search query")
1863
- p_search.add_argument("-b", "--backend", default="duckduckgo",
1864
- choices=["duckduckgo", "ddg", "serpapi", "brave"])
1865
- p_search.add_argument("-n", "--max-results", type=int, default=10)
1866
- p_search.add_argument("--no-save", dest="save", action="store_false")
1867
- p_search.set_defaults(func=cmd_search)
1868
-
1869
- # batch
1870
- p_batch = subparsers.add_parser("batch", help="Batch research all competitors")
1871
- p_batch.add_argument("-b", "--backend", default="duckduckgo")
1872
- p_batch.add_argument("-n", "--max-results", type=int, default=10)
1873
- p_batch.add_argument("-d", "--delay", type=float, default=0.5)
1874
- p_batch.add_argument("--days", type=int, default=7,
1875
- help="Skip queries run within N days (default: 7)")
1876
- p_batch.add_argument("-f", "--force", action="store_true",
1877
- help="Force re-run all queries (ignore deduplication)")
1878
- p_batch.set_defaults(func=cmd_batch)
1879
-
1880
- # competitor (deep intel)
1881
- p_comp = subparsers.add_parser("competitor", help="Deep competitive intelligence on a company")
1882
- p_comp.add_argument("company", nargs="?", default="", help="Company name (e.g. 'Ekso Bionics')")
1883
- p_comp.add_argument("--external-llm", action="store_true",
1884
- help="Also use external LLM (OpenRouter) for enhanced analysis")
1885
- p_comp.add_argument("-c", "--categories", type=str, default=None,
1886
- help="Comma-separated categories (default: all)")
1887
- p_comp.add_argument("--list-categories", action="store_true",
1888
- help="List available categories")
1889
- p_comp.add_argument("-n", "--max-results", type=int, default=10)
1890
- p_comp.add_argument("-d", "--delay", type=float, default=1.0,
1891
- help="Delay between searches in seconds (default: 1.0)")
1892
- p_comp.set_defaults(func=cmd_competitor)
1893
-
1894
- # sota
1895
- p_sota = subparsers.add_parser("sota", help="SOTA technology knowledge base for GURMA.ai")
1896
- p_sota.add_argument("--analyze", "-a", type=str, default=None,
1897
- help="Analyze a document and update knowledge base")
1898
- p_sota.add_argument("--show", "-s", type=str, default=None, nargs="?",
1899
- const=None,
1900
- choices=["models", "techniques", "stack", "principles", "actions", "sources"],
1901
- help="Show specific KB section (default: summary)")
1902
- p_sota.set_defaults(func=cmd_sota)
1903
-
1904
- # extract
1905
- p_extract = subparsers.add_parser("extract", help="Extract competitor data to JSON")
1906
- p_extract.set_defaults(func=cmd_extract)
1907
-
1908
- # list
1909
- p_list = subparsers.add_parser("list", help="List saved searches")
1910
- p_list.add_argument("-l", "--limit", type=int, default=20)
1911
- p_list.set_defaults(func=cmd_list)
1912
-
1913
- args = parser.parse_args()
1914
-
1915
- if hasattr(args, "func"):
1916
- args.func(args)
1917
- else:
1918
- parser.print_help()
1919
-
1920
 
1921
  if __name__ == "__main__":
1922
  main()
 
1
  #!/usr/bin/env python3
2
  """
3
+ GURMA.ai Research Tool — backwards-compatible shim.
4
+
5
+ All logic has been split into focused modules:
6
+ config.py — paths, API keys, constants
7
+ search.py — backends, storage, SearchService
8
+ llm.py — LLMClient (OpenRouter)
9
+ intel.py — CompetitorIntelAgent
10
+ extract.py — CompetitorExtractor, COMPANY_DEFINITIONS
11
+ cli.py — CLI commands and argparse
12
+ sota_agent.py — SOTA knowledge base agent
13
+ tr_agents.py — Turkish research agents
14
+
15
+ This file re-exports everything so existing imports work unchanged:
16
+ from research import SearchService, CompetitorExtractor, ...
17
+ python research.py batch
18
  """
19
 
20
+ try:
21
+ # Package context (src/utils/)
22
+ from .config import (
23
+ PROJECT_ROOT, IS_HF_SPACE, RESEARCH_DIR, DATA_DIR,
24
+ SERPAPI_KEY, BRAVE_API_KEY, OPENROUTER_API_KEY,
25
+ LLM_MODEL, LLM_ENABLED,
26
+ COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES,
27
  )
28
+ from .search import (
29
+ WebSearchResult, SearchBackend, DuckDuckGoBackend,
30
+ SerpAPIBackend, BraveBackend, BACKENDS, get_backend,
31
+ ResultStorage, SearchService,
32
  )
33
+ from .llm import LLMClient
34
+ from .intel import (
35
+ IntelSection, CompetitorIntelAgent,
36
+ DEEP_INTEL_CATEGORIES, PRIMARY_SOURCE_DOMAINS,
37
+ CATEGORY_EXPECTED, CATEGORY_SYNTHESIS_QUESTIONS,
38
+ )
39
+ from .extract import (
40
+ CompetitorExtractor, COMPANY_DEFINITIONS,
41
+ STATUS_KEYWORDS, DATE_PATTERN, MONEY_PATTERN,
42
+ )
43
+ from .cli import main
44
+ except ImportError:
45
+ # Flat-file context (HF Space: all .py files in same directory)
46
+ from config import ( # type: ignore[no-redef]
47
+ PROJECT_ROOT, IS_HF_SPACE, RESEARCH_DIR, DATA_DIR,
48
+ SERPAPI_KEY, BRAVE_API_KEY, OPENROUTER_API_KEY,
49
+ LLM_MODEL, LLM_ENABLED,
50
+ COMPETITORS, BATCH_QUERY_TEMPLATES, MARKET_QUERIES,
51
+ )
52
+ from search import ( # type: ignore[no-redef]
53
+ WebSearchResult, SearchBackend, DuckDuckGoBackend,
54
+ SerpAPIBackend, BraveBackend, BACKENDS, get_backend,
55
+ ResultStorage, SearchService,
56
+ )
57
+ from llm import LLMClient # type: ignore[no-redef]
58
+ from intel import ( # type: ignore[no-redef]
59
+ IntelSection, CompetitorIntelAgent,
60
+ DEEP_INTEL_CATEGORIES, PRIMARY_SOURCE_DOMAINS,
61
+ CATEGORY_EXPECTED, CATEGORY_SYNTHESIS_QUESTIONS,
62
+ )
63
+ from extract import ( # type: ignore[no-redef]
64
+ CompetitorExtractor, COMPANY_DEFINITIONS,
65
+ STATUS_KEYWORDS, DATE_PATTERN, MONEY_PATTERN,
66
+ )
67
+ from cli import main # type: ignore[no-redef]
 
68
 
69
  if __name__ == "__main__":
70
  main()
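For reference, a minimal usage sketch of the shim (the query string is illustrative): the same import works in both the package layout and the flat HF Space layout, because the try/except above resolves whichever module set is present.

    from research import SearchService

    svc = SearchService(backend="duckduckgo")   # backed by search.py in either layout
    hits = svc.search("gait rehabilitation exoskeleton market", max_results=5, save=False)
    for h in hits:
        print(f"[{h.source}] {h.title} -> {h.url}")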
search.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ Search backends, result storage, and the SearchService facade.
3
+
4
+ Provides multi-backend web search (DuckDuckGo, SerpAPI, Brave),
5
+ result persistence (JSON + Markdown), and a unified SearchService.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import sys
12
+ import time
13
+ from abc import ABC, abstractmethod
14
+ from dataclasses import dataclass, asdict
15
+ from datetime import datetime, timedelta
16
+ from pathlib import Path
17
+ from typing import Protocol
18
+
19
+ try:
20
+ from .config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
21
+ except ImportError:
22
+ from config import RESEARCH_DIR, SERPAPI_KEY, BRAVE_API_KEY
23
+
24
+
25
+ # ============================================================
26
+ # Data Types
27
+ # ============================================================
28
+
29
+ class SearchResult(Protocol):
30
+ title: str
31
+ url: str
32
+ snippet: str
33
+ source: str
34
+
35
+
36
+ @dataclass
37
+ class WebSearchResult:
38
+ title: str
39
+ url: str
40
+ snippet: str
41
+ source: str
42
+
43
+
44
+ # ============================================================
45
+ # Search Backends
46
+ # ============================================================
47
+
48
+ class SearchBackend(ABC):
49
+ """Abstract base for search backends."""
50
+
51
+ @property
52
+ @abstractmethod
53
+ def name(self) -> str:
54
+ pass
55
+
56
+ @abstractmethod
57
+ def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
58
+ pass
59
+
60
+ @abstractmethod
61
+ def is_available(self) -> bool:
62
+ pass
63
+
64
+
65
+ class DuckDuckGoBackend(SearchBackend):
66
+
67
+ @property
68
+ def name(self) -> str:
69
+ return "duckduckgo"
70
+
71
+ def is_available(self) -> bool:
72
+ try:
73
+ from ddgs import DDGS
74
+ return True
75
+ except ImportError:
76
+ try:
77
+ from duckduckgo_search import DDGS
78
+ return True
79
+ except ImportError:
80
+ return False
81
+
82
+ def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
83
+ try:
84
+ from ddgs import DDGS
85
+ except ImportError:
86
+ from duckduckgo_search import DDGS
87
+
88
+ results = []
89
+ ddgs = DDGS()
90
+ for r in ddgs.text(query, max_results=max_results):
91
+ results.append(WebSearchResult(
92
+ title=r.get("title", ""),
93
+ url=r.get("href", r.get("link", "")),
94
+ snippet=r.get("body", r.get("snippet", "")),
95
+ source=self.name
96
+ ))
97
+ return results
98
+
99
+
100
+ class SerpAPIBackend(SearchBackend):
101
+
102
+ @property
103
+ def name(self) -> str:
104
+ return "serpapi"
105
+
106
+ def is_available(self) -> bool:
107
+ try:
108
+ import requests
109
+ return bool(SERPAPI_KEY)
110
+ except ImportError:
111
+ return False
112
+
113
+ def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
114
+ import requests
115
+
116
+ response = requests.get(
117
+ "https://serpapi.com/search",
118
+ params={"q": query, "api_key": SERPAPI_KEY, "engine": "google", "num": max_results},
119
+ timeout=30
120
+ )
121
+ response.raise_for_status()
122
+ data = response.json()
123
+
124
+ results = []
125
+ for r in data.get("organic_results", [])[:max_results]:
126
+ results.append(WebSearchResult(
127
+ title=r.get("title", ""),
128
+ url=r.get("link", ""),
129
+ snippet=r.get("snippet", ""),
130
+ source=self.name
131
+ ))
132
+ return results
133
+
134
+
135
+ class BraveBackend(SearchBackend):
136
+
137
+ @property
138
+ def name(self) -> str:
139
+ return "brave"
140
+
141
+ def is_available(self) -> bool:
142
+ try:
143
+ import requests
144
+ return bool(BRAVE_API_KEY)
145
+ except ImportError:
146
+ return False
147
+
148
+ def search(self, query: str, max_results: int = 10) -> list[WebSearchResult]:
149
+ import requests
150
+
151
+ response = requests.get(
152
+ "https://api.search.brave.com/res/v1/web/search",
153
+ headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY},
154
+ params={"q": query, "count": min(max_results, 20)},
155
+ timeout=30
156
+ )
157
+ response.raise_for_status()
158
+ data = response.json()
159
+
160
+ results = []
161
+ for r in data.get("web", {}).get("results", [])[:max_results]:
162
+ results.append(WebSearchResult(
163
+ title=r.get("title", ""),
164
+ url=r.get("url", ""),
165
+ snippet=r.get("description", ""),
166
+ source=self.name
167
+ ))
168
+ return results
169
+
170
+
171
+ # Backend registry
172
+ BACKENDS: dict[str, SearchBackend] = {
173
+ "duckduckgo": DuckDuckGoBackend(),
174
+ "ddg": DuckDuckGoBackend(),
175
+ "serpapi": SerpAPIBackend(),
176
+ "brave": BraveBackend(),
177
+ }
178
+
179
+
180
+ def get_backend(name: str = "duckduckgo") -> SearchBackend:
181
+ backend = BACKENDS.get(name)
182
+ if not backend:
183
+ raise ValueError(f"Unknown backend: {name}. Available: {list(BACKENDS.keys())}")
184
+ if not backend.is_available():
185
+ raise RuntimeError(f"Backend '{name}' not available. Check dependencies/API keys.")
186
+ return backend
187
+
188
+
189
+ # ============================================================
190
+ # Result Storage
191
+ # ============================================================
192
+
193
+ class ResultStorage:
194
+
195
+ def __init__(self, directory: Path = RESEARCH_DIR):
196
+ self.directory = directory
197
+ self.directory.mkdir(parents=True, exist_ok=True)
198
+
199
+ def save(self, query: str, results: list[WebSearchResult], backend: str) -> tuple[Path, Path]:
200
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
201
+ slug = self._slugify(query)
202
+ base_name = f"{timestamp}_{slug}"
203
+
204
+ data = {
205
+ "query": query,
206
+ "timestamp": datetime.now().isoformat(),
207
+ "backend": backend,
208
+ "result_count": len(results),
209
+ "results": [asdict(r) for r in results]
210
+ }
211
+
212
+ json_path = self.directory / f"{base_name}.json"
213
+ with open(json_path, "w") as f:
214
+ json.dump(data, f, indent=2)
215
+
216
+ md_path = self.directory / f"{base_name}.md"
217
+ with open(md_path, "w") as f:
218
+ f.write(f"# Search: {query}\n\n")
219
+ f.write(f"**Date:** {data['timestamp']} \n")
220
+ f.write(f"**Backend:** {backend} \n")
221
+ f.write(f"**Results:** {len(results)}\n\n---\n")
222
+ for i, r in enumerate(results, 1):
223
+ f.write(f"\n## {i}. {r.title}\n\n**URL:** {r.url}\n\n{r.snippet}\n")
224
+
225
+ return json_path, md_path
226
+
227
+ def list_searches(self, limit: int = 20) -> list[dict]:
228
+ searches = []
229
+ for json_file in sorted(self.directory.glob("*.json"), reverse=True):
230
+ if json_file.name.startswith("."):
231
+ continue
232
+ try:
233
+ with open(json_file) as f:
234
+ data = json.load(f)
235
+ searches.append({
236
+ "file": json_file.name,
237
+ "query": data.get("query", ""),
238
+ "timestamp": data.get("timestamp", ""),
239
+ "results": data.get("result_count", 0)
240
+ })
241
+ except Exception:
242
+ pass
243
+ if len(searches) >= limit:
244
+ break
245
+ return searches
246
+
247
+ def get_recent_queries(self, days: int = 7) -> set[str]:
248
+ cutoff = datetime.now() - timedelta(days=days)
249
+ recent = set()
250
+
251
+ for json_file in self.directory.glob("*.json"):
252
+ if json_file.name.startswith("."):
253
+ continue
254
+ try:
255
+ with open(json_file) as f:
256
+ data = json.load(f)
257
+ ts = data.get("timestamp", "")
258
+ if ts:
259
+ file_date = datetime.fromisoformat(ts.replace("Z", "+00:00").split("+")[0])
260
+ if file_date >= cutoff:
261
+ query = data.get("query", "").lower().strip()
262
+ recent.add(query)
263
+ except Exception:
264
+ pass
265
+ return recent
266
+
267
+ def _slugify(self, text: str, max_len: int = 50) -> str:
268
+ slug = text.lower()[:max_len].replace(" ", "-").replace("/", "-")
269
+ return "".join(c for c in slug if c.isalnum() or c == "-")
270
+
271
+
272
+ # ============================================================
273
+ # Search Service (Facade)
274
+ # ============================================================
275
+
276
+ class SearchService:
277
+ """High-level search interface combining backend and storage."""
278
+
279
+ def __init__(self, backend: str = "duckduckgo", storage: ResultStorage | None = None):
280
+ self.backend = get_backend(backend)
281
+ self.storage = storage or ResultStorage()
282
+
283
+ def search(self, query: str, max_results: int = 10, save: bool = True) -> list[WebSearchResult]:
284
+ results = self.backend.search(query, max_results)
285
+ if save and results:
286
+ self.storage.save(query, results, self.backend.name)
287
+ return results
288
+
289
+ def search_batch(self, queries: list[str], max_results: int = 10,
290
+ delay: float = 0.5, callback=None) -> dict[str, int]:
291
+ stats = {}
292
+ for i, query in enumerate(queries, 1):
293
+ if callback:
294
+ callback(i, len(queries), query)
295
+ try:
296
+ results = self.search(query, max_results, save=True)
297
+ stats[query] = len(results)
298
+ except Exception as e:
299
+ stats[query] = -1
300
+ print(f"Error on '{query}': {e}", file=sys.stderr)
301
+
302
+ if delay > 0 and i < len(queries):
303
+ time.sleep(delay)
304
+
305
+ return stats
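A short sketch of how the pieces above compose (the queries are placeholders; the dedup step simply reuses ResultStorage.get_recent_queries as defined in this file):

    from search import SearchService, ResultStorage

    storage = ResultStorage()
    recent = storage.get_recent_queries(days=7)   # queries already run this week

    queries = [
        "rehabilitation robotics funding 2026",
        "exoskeleton clinical trial results",
    ]
    fresh = [q for q in queries if q.lower().strip() not in recent]

    svc = SearchService(backend="duckduckgo", storage=storage)
    stats = svc.search_batch(
        fresh,
        max_results=10,
        delay=1.0,                                 # polite pause between backend calls
        callback=lambda i, n, q: print(f"[{i}/{n}] {q}"),
    )
    print(stats)                                   # query -> result count, -1 on error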
sota_agent.py ADDED
@@ -0,0 +1,850 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ GURMA.ai SOTA Technology Knowledge Agent
4
+
5
+ Maintains a persistent knowledge base of state-of-the-art models, techniques,
6
+ and tools relevant to GURMA.ai's high-precision medical/rehabilitation AI domain.
7
+
8
+ The agent is "aware" of GURMA.ai's strategic position (outcome data moat,
9
+ edge-first, safety-critical) and filters all technology developments through
10
+ that lens. It updates itself when fed new information (papers, podcasts,
11
+ announcements).
12
+
13
+ Usage:
14
+ python research.py sota # Show current knowledge base
15
+ python research.py sota --analyze notes/research/podcast-sota-models.md
16
+ python research.py sota --show models # Show tracked models
17
+ python research.py sota --show techniques # Show tracked techniques
18
+ python research.py sota --show stack # Show recommended tech stack
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import re
25
+ from dataclasses import dataclass, field, asdict
26
+ from datetime import datetime
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+ try:
31
+ from .llm import LLMClient
32
+ from .config import RESEARCH_DIR, LLM_ENABLED
33
+ except ImportError:
34
+ from llm import LLMClient
35
+ from config import RESEARCH_DIR, LLM_ENABLED
36
+
37
+
38
+ # ============================================================
39
+ # GURMA.ai Context — what the agent "knows" about the company
40
+ # ============================================================
41
+
42
+ GURMA_CONTEXT = {
43
+ "company": "GURMA.ai AG (Swiss)",
44
+ "domain": "Rehabilitation robotics AI — high-precision medical domain",
45
+ "data_moat": "15 years of patient outcome data from BAMA Teknoloji "
46
+ "(gait dynamics, EMG signals, recovery outcomes — not just motion data)",
47
+ "products": ["RoboGate (stationary gait rehab robot)", "FreeGate (5-axis mobile exoskeleton)"],
48
+ "architecture": "Privacy-first edge computing — no cloud data exposure",
49
+ "regulatory": "EU AI Act (high-risk), MDR, ISO 13485, GDPR/KVKK — 80% safety focus from day one",
50
+ "precision_requirement": (
51
+ "Medical rehabilitation demands super-high precision: wrong therapy parameters "
52
+ "can harm patients. Models must be verifiable, explainable, and fail-safe. "
53
+ "This is NOT a domain where 'good enough' works — it requires domain-specific "
54
+ "training on real outcome data with verifiable reward signals."
55
+ ),
56
+ "core_thesis": (
57
+ "Proprietary outcome data + domain expertise + regulatory focus = defensible AI moat. "
58
+ "Frontier labs are NOT focusing on domain-specific medical applications."
59
+ ),
60
+ }
61
+
62
+ # ============================================================
63
+ # Relevance Scoring — how the agent filters incoming info
64
+ # ============================================================
65
+
66
+ GURMA_RELEVANCE_SIGNALS = {
67
+ "outcome_data": {
68
+ "weight": 3,
69
+ "description": "Patient outcomes as training signal / verifiable rewards",
70
+ "patterns": [
71
+ r"outcome.?data", r"patient.?outcome", r"recovery.?outcome",
72
+ r"treatment.?outcome", r"verifiable.?reward", r"reward.?model",
73
+ r"clinical.?outcome", r"reward.?signal",
74
+ ],
75
+ },
76
+ "rehabilitation": {
77
+ "weight": 3,
78
+ "description": "Rehabilitation, gait, exoskeleton, motor recovery tech",
79
+ "patterns": [
80
+ r"rehabilitat", r"gait.?(?:analysis|dynamic|training)",
81
+ r"exoskeleton", r"physical.?therapy", r"motor.?recovery",
82
+ r"neurorehab", r"stroke.?recovery",
83
+ ],
84
+ },
85
+ "high_precision": {
86
+ "weight": 3,
87
+ "description": "High-precision / safety-critical model requirements",
88
+ "patterns": [
89
+ r"high.?precision", r"safety.?critical", r"fail.?(?:safe|never)",
90
+ r"verification.?layer", r"verifiable", r"explainabl",
91
+ r"clinical.?(?:accuracy|precision|validation)",
92
+ ],
93
+ },
94
+ "domain_specific": {
95
+ "weight": 2,
96
+ "description": "Domain-specific fine-tuning — GURMA.ai's core approach",
97
+ "patterns": [
98
+ r"domain.specific", r"fine.tun", r"\bLoRA\b", r"specialized.?model",
99
+ r"medical.?(?:model|AI|LLM)", r"clinical.?(?:NLP|model)",
100
+ r"proprietary.?data",
101
+ ],
102
+ },
103
+ "rl_training": {
104
+ "weight": 2,
105
+ "description": "RL post-training — outcome data as rewards",
106
+ "patterns": [
107
+ r"\bRLHF\b", r"\bRLVR\b", r"reinforcement.?learning",
108
+ r"post.training", r"\bPPO\b", r"\bGRPO\b", r"reward.?shaping",
109
+ ],
110
+ },
111
+ "edge_privacy": {
112
+ "weight": 2,
113
+ "description": "Edge inference, on-device, privacy-first deployment",
114
+ "patterns": [
115
+ r"edge.?(?:computing|deploy|inference|device)",
116
+ r"on.device", r"privacy.first", r"local.?inference",
117
+ r"quantiz", r"on.premise",
118
+ ],
119
+ },
120
+ "safety_regulatory": {
121
+ "weight": 2,
122
+ "description": "AI safety, medical device regulation, verification",
123
+ "patterns": [
124
+ r"AI.?safety", r"medical.?device", r"(?:EU|FDA).?(?:AI|regulat)",
125
+ r"\bMDR\b", r"CE.?mark", r"ISO.?13485", r"verification.?layer",
126
+ r"constitutional.?AI",
127
+ ],
128
+ },
129
+ "robotics": {
130
+ "weight": 2,
131
+ "description": "Robotics AI, world models, embodied AI, patient simulation",
132
+ "patterns": [
133
+ r"robot(?:ic)?s?.?(?:AI|control|learning)",
134
+ r"world.?model", r"embodied.?AI", r"simulation.?model",
135
+ r"continual.?learning",
136
+ ],
137
+ },
138
+ "open_models": {
139
+ "weight": 1,
140
+ "description": "Open-weight models with clear licensing for medical use",
141
+ "patterns": [
142
+ r"open.weight", r"open.source.?(?:model|LLM)",
143
+ r"\bQwen\b", r"\bOLMo\b", r"DeepSeek", r"\bLlama\b",
144
+ r"\bMistral\b", r"gpt.oss",
145
+ ],
146
+ },
147
+ "tool_use": {
148
+ "weight": 1,
149
+ "description": "Tool-calling AI for clinical workflow automation",
150
+ "patterns": [
151
+ r"tool.?(?:use|calling)", r"function.?call",
152
+ r"(?:AI|LLM).?agent", r"autonomous.?agent",
153
+ ],
154
+ },
155
+ }
156
+
157
+ # ============================================================
158
+ # Initial Knowledge Base — seeded from podcast analysis
159
+ # ============================================================
160
+
161
+ INITIAL_KNOWLEDGE_BASE = {
162
+ "models": [
163
+ {
164
+ "name": "Qwen 3",
165
+ "params": "7B-32B range",
166
+ "why": "Best open-weight performance (50T tokens trained), friendly commercial license",
167
+ "gurma_fit": "Base model for domain fine-tuning; fewer restrictions than Llama for medical device use",
168
+ "status": "recommended",
169
+ "added": "2026-02-06",
170
+ "source": "Lex Fridman Podcast #490",
171
+ },
172
+ {
173
+ "name": "OLMo 3",
174
+ "params": "7B+",
175
+ "why": "Fully documented training process, truly open (AI2), great for understanding methodology",
176
+ "gurma_fit": "Best for learning/reproducing training; full transparency aids regulatory documentation",
177
+ "status": "recommended",
178
+ "added": "2026-02-06",
179
+ "source": "Lex Fridman Podcast #490",
180
+ },
181
+ {
182
+ "name": "gpt-oss-120b",
183
+ "params": "120B",
184
+ "why": "First open model specifically trained with tool use in mind",
185
+ "gurma_fit": "Tool-calling for patient data APIs, sensor queries, automated insurance reporting",
186
+ "status": "watch",
187
+ "added": "2026-02-06",
188
+ "source": "Lex Fridman Podcast #490",
189
+ },
190
+ {
191
+ "name": "DeepSeek-V3.2",
192
+ "params": "varies",
193
+ "why": "Sparse attention architecture, efficient inference",
194
+ "gurma_fit": "Sparse attention promising for edge deployment on RoboGate/FreeGate",
195
+ "status": "watch",
196
+ "added": "2026-02-06",
197
+ "source": "Lex Fridman Podcast #490",
198
+ },
199
+ ],
200
+ "techniques": [
201
+ {
202
+ "name": "RLVR (Reinforcement Learning with Verifiable Rewards)",
203
+ "category": "post-training",
204
+ "why": "Post-training is the 'skill unlock' — pre-training gives knowledge, post-training gives precision",
205
+ "gurma_fit": "Patient recovery outcomes ARE verifiable rewards. 15 years of outcome data = perfect RLVR signal.",
206
+ "priority": "high",
207
+ "added": "2026-02-06",
208
+ "source": "Lex Fridman Podcast #490",
209
+ },
210
+ {
211
+ "name": "LoRA (Low-Rank Adaptation)",
212
+ "category": "fine-tuning",
213
+ "why": "Fine-tune only a small subset of weights; practical on limited compute; proven on 7B models",
214
+ "gurma_fit": "Start with 7B models + LoRA for engagement scoring and outcome prediction. Efficient enough for iterative experiments.",
215
+ "priority": "high",
216
+ "added": "2026-02-06",
217
+ "source": "Lex Fridman Podcast #490",
218
+ },
219
+ {
220
+ "name": "PPO / GRPO",
221
+ "category": "post-training",
222
+ "why": "Policy gradient algorithms for RL post-training; PPO is standard, GRPO is newer group-relative approach",
223
+ "gurma_fit": "Algorithms to train models using outcome data as reward signal",
224
+ "priority": "medium",
225
+ "added": "2026-02-06",
226
+ "source": "Lex Fridman Podcast #490",
227
+ },
228
+ {
229
+ "name": "Synthetic Data (reformatting)",
230
+ "category": "data-processing",
231
+ "why": "Not 'AI-generated fake data' — means restructuring real data into training formats (Q&A, summaries). OCR for medical PDFs.",
232
+ "gurma_fit": "Convert EMG readings → Q&A format, session notes → case summaries, treatment logs → outcome predictions",
233
+ "priority": "high",
234
+ "added": "2026-02-06",
235
+ "source": "Lex Fridman Podcast #490",
236
+ },
237
+ {
238
+ "name": "World Models",
239
+ "category": "simulation",
240
+ "why": "Model runs a simulation of the environment; verifies intermediate states, not just final results",
241
+ "gurma_fit": "Patient progress modeling IS a world model problem. Verify intermediate therapy states, simulate treatment outcomes.",
242
+ "priority": "medium",
243
+ "added": "2026-02-06",
244
+ "source": "Lex Fridman Podcast #490",
245
+ },
246
+ {
247
+ "name": "Sparse Attention",
248
+ "category": "efficiency",
249
+ "why": "Lightweight token selection indexer; efficient inference for edge deployment",
250
+ "gurma_fit": "Could enable on-device models for RoboGate/FreeGate with privacy-first architecture",
251
+ "priority": "medium",
252
+ "added": "2026-02-06",
253
+ "source": "Lex Fridman Podcast #490",
254
+ },
255
+ ],
256
+ "tech_stack": [
257
+ {"component": "Base Model", "recommendation": "Qwen 3 (7B-32B) or OLMo 3", "rationale": "Open weights, good license, well-documented"},
258
+ {"component": "Fine-tuning", "recommendation": "LoRA + RLVR", "rationale": "Practical compute, outcome-based rewards"},
259
+ {"component": "Tool Use Model", "recommendation": "gpt-oss-120b", "rationale": "Specifically trained for tool calling"},
260
+ {"component": "Training Framework", "recommendation": "TRL (Hugging Face)", "rationale": "RLHF/RLVR implementation"},
261
+ {"component": "Inference", "recommendation": "vLLM or SGLang", "rationale": "Production-ready, NOT HF Transformers in prod"},
262
+ {"component": "Edge", "recommendation": "Quantized models + sparse attention", "rationale": "Privacy-first deployment"},
263
+ {"component": "Safety", "recommendation": "Constitutional AI principles + verification layers", "rationale": "Medical device compliance, 'allowed to fail never'"},
264
+ ],
265
+ "key_principles": [
266
+ {
267
+ "principle": "Domain-specific data is the moat",
268
+ "detail": "Frontier labs won't build rehab-specific models. Proprietary outcome data that nobody else can access is the defensible advantage.",
269
+ "source": "Lex Fridman Podcast #490 — Sebastian Raschka",
270
+ },
271
+ {
272
+ "principle": "Post-training over pre-training",
273
+ "detail": "Don't spend on pre-training. Use open base models + invest in post-training (RLVR) where outcome data becomes the competitive edge.",
274
+ "source": "Lex Fridman Podcast #490",
275
+ },
276
+ {
277
+ "principle": "Data quality over quantity",
278
+ "detail": "Curate aggressively. Reformat existing data into multiple training formats. Clean > big.",
279
+ "source": "Lex Fridman Podcast #490 — Nathan Lambert",
280
+ },
281
+ {
282
+ "principle": "Human verification mandatory for medical AI",
283
+ "detail": "Tool-calling and autonomous agents still require human-in-the-loop for trust/safety in clinical context.",
284
+ "source": "Lex Fridman Podcast #490",
285
+ },
286
+ {
287
+ "principle": "High precision is non-negotiable",
288
+ "detail": "Medical rehab is a 'fail never' domain. Models must be verifiable, with intermediate state checking (world model approach).",
289
+ "source": "Lex Fridman Podcast #490 — Lex Fridman",
290
+ },
291
+ ],
292
+ "sources_analyzed": [
293
+ {
294
+ "name": "Lex Fridman Podcast #490 — State of AI in 2026",
295
+ "type": "podcast",
296
+ "date": "2026-02-06",
297
+ "key_speakers": "Nathan Lambert (AI2), Sebastian Raschka",
298
+ "insights_extracted": 10,
299
+ },
300
+ ],
301
+ "action_items": [
302
+ {"item": "Experiment with Qwen 3 / OLMo 3 on rehabilitation domain prompts", "status": "pending"},
303
+ {"item": "Structure outcome data for RLVR — create verifiable reward functions (gait improvement score, session completion rate)", "status": "pending"},
304
+ {"item": "Build tool schemas — APIs for patient data access, sensor queries, report generation", "status": "pending"},
305
+ {"item": "Set up vLLM for production inference", "status": "pending"},
306
+ {"item": "Study Nathan Lambert's RLHF book when released", "status": "pending"},
307
+ ],
308
+ "metadata": {
309
+ "created": "2026-02-06",
310
+ "last_updated": "2026-02-06",
311
+ "version": 1,
312
+ },
313
+ }
314
+
315
+
316
+ # ============================================================
317
+ # SOTA Knowledge Agent
318
+ # ============================================================
319
+
320
+ class SOTAScoutAgent:
321
+ """Maintains and updates GURMA.ai's SOTA technology knowledge base.
322
+
323
+ The agent understands that GURMA.ai operates in a high-precision medical
324
+ domain where model accuracy, verifiability, and safety are non-negotiable.
325
+ It filters all technology developments through this lens.
326
+
327
+ Usage:
328
+ agent = SOTAScoutAgent()
329
+ agent.show() # Print current KB state
330
+ agent.show("models") # Show tracked models
331
+ agent.analyze("notes/research/podcast.md") # Analyze + update KB
332
+ """
333
+
334
+ def __init__(self, llm: LLMClient = None):
335
+ self.llm = llm or LLMClient()
336
+ self.kb_dir = RESEARCH_DIR / "sota"
337
+ self.kb_dir.mkdir(parents=True, exist_ok=True)
338
+ self.kb_path = self.kb_dir / "knowledge_base.json"
339
+ self.kb = self._load_kb()
340
+
341
+ # ----------------------------------------------------------
342
+ # Persistence
343
+ # ----------------------------------------------------------
344
+
345
+ def _load_kb(self) -> dict:
346
+ """Load existing KB or initialize from seed."""
347
+ if self.kb_path.exists():
348
+ try:
349
+ with open(self.kb_path) as f:
350
+ return json.load(f)
351
+ except Exception:
352
+ pass
353
+ # First run — seed from initial knowledge
354
+ kb = json.loads(json.dumps(INITIAL_KNOWLEDGE_BASE))
355
+ self._save_kb(kb)
356
+ return kb
357
+
358
+ def _save_kb(self, kb: dict = None):
359
+ """Persist knowledge base to disk."""
360
+ kb = kb or self.kb
361
+ kb["metadata"]["last_updated"] = datetime.now().strftime("%Y-%m-%d")
362
+ with open(self.kb_path, "w") as f:
363
+ json.dump(kb, f, indent=2)
364
+
365
+ # ----------------------------------------------------------
366
+ # Public: Show
367
+ # ----------------------------------------------------------
368
+
369
+ def show(self, section: str = None) -> dict:
370
+ """Display current knowledge base state.
371
+
372
+ Args:
373
+ section: Optional — "models", "techniques", "stack", "principles",
374
+ "actions", "sources". None = summary of everything.
375
+
376
+ Returns: The KB data (also prints to stdout).
377
+ """
378
+ if section == "models":
379
+ self._print_models()
380
+ elif section == "techniques":
381
+ self._print_techniques()
382
+ elif section == "stack":
383
+ self._print_stack()
384
+ elif section == "principles":
385
+ self._print_principles()
386
+ elif section == "actions":
387
+ self._print_actions()
388
+ elif section == "sources":
389
+ self._print_sources()
390
+ else:
391
+ self._print_summary()
392
+
393
+ return self.kb
394
+
395
+ def _print_summary(self):
396
+ meta = self.kb.get("metadata", {})
397
+ models = self.kb.get("models", [])
398
+ techniques = self.kb.get("techniques", [])
399
+ stack = self.kb.get("tech_stack", [])
400
+ principles = self.kb.get("key_principles", [])
401
+ actions = self.kb.get("action_items", [])
402
+ sources = self.kb.get("sources_analyzed", [])
403
+
404
+ print(f"\n{'='*60}")
405
+ print(f"GURMA.ai SOTA Knowledge Base")
406
+ print(f"{'='*60}")
407
+ print(f"Last updated: {meta.get('last_updated', 'unknown')}")
408
+ print(f"Version: {meta.get('version', 0)}")
409
+ print(f"")
410
+ print(f" Models tracked: {len(models)}")
411
+ print(f" Techniques tracked: {len(techniques)}")
412
+ print(f" Tech stack items: {len(stack)}")
413
+ print(f" Key principles: {len(principles)}")
414
+ print(f" Action items: {len(actions)} ({sum(1 for a in actions if a.get('status') == 'pending')} pending)")
415
+ print(f" Sources analyzed: {len(sources)}")
416
+ print(f"")
417
+
418
+ rec_models = [m for m in models if m.get("status") == "recommended"]
419
+ if rec_models:
420
+ print(f"Recommended models:")
421
+ for m in rec_models:
422
+ print(f" * {m['name']} ({m.get('params', '?')}) — {m.get('gurma_fit', '')[:80]}")
423
+
424
+ high_tech = [t for t in techniques if t.get("priority") == "high"]
425
+ if high_tech:
426
+ print(f"\nHigh-priority techniques:")
427
+ for t in high_tech:
428
+ print(f" * {t['name']} — {t.get('gurma_fit', '')[:80]}")
429
+
430
+ pending = [a for a in actions if a.get("status") == "pending"]
431
+ if pending:
432
+ print(f"\nPending action items:")
433
+ for a in pending[:5]:
434
+ print(f" [ ] {a['item']}")
435
+
436
+ print(f"\n{'='*60}")
437
+
438
+ def _print_models(self):
439
+ print(f"\n--- Tracked Models ---\n")
440
+ for m in self.kb.get("models", []):
441
+ status_icon = {"recommended": "*", "watch": "~", "deprecated": "x"}.get(m.get("status", ""), "?")
442
+ print(f" [{status_icon}] {m['name']} ({m.get('params', '?')})")
443
+ print(f" Why: {m.get('why', '')}")
444
+ print(f" GURMA.ai fit: {m.get('gurma_fit', '')}")
445
+ print(f" Source: {m.get('source', '')} | Added: {m.get('added', '')}")
446
+ print()
447
+
448
+ def _print_techniques(self):
449
+ print(f"\n--- Tracked Techniques ---\n")
450
+ for t in self.kb.get("techniques", []):
451
+ pri = {"high": "!!!", "medium": "!!", "low": "!"}.get(t.get("priority", ""), "?")
452
+ print(f" [{pri}] {t['name']} ({t.get('category', '')})")
453
+ print(f" Why: {t.get('why', '')}")
454
+ print(f" GURMA.ai fit: {t.get('gurma_fit', '')}")
455
+ print()
456
+
457
+ def _print_stack(self):
458
+ print(f"\n--- Recommended Tech Stack ---\n")
459
+ for s in self.kb.get("tech_stack", []):
460
+ print(f" {s['component']:20s} -> {s['recommendation']}")
461
+ print(f" {'':20s} {s.get('rationale', '')}")
462
+ print()
463
+
464
+ def _print_principles(self):
465
+ print(f"\n--- Key Principles ---\n")
466
+ for p in self.kb.get("key_principles", []):
467
+ print(f" * {p['principle']}")
468
+ print(f" {p.get('detail', '')}")
469
+ print()
470
+
471
+ def _print_actions(self):
472
+ print(f"\n--- Action Items ---\n")
473
+ for a in self.kb.get("action_items", []):
474
+ icon = "[x]" if a.get("status") == "done" else "[ ]"
475
+ print(f" {icon} {a['item']}")
476
+
477
+ def _print_sources(self):
478
+ print(f"\n--- Analyzed Sources ---\n")
479
+ for s in self.kb.get("sources_analyzed", []):
480
+ print(f" {s.get('date', '?')} | {s['name']} ({s.get('type', '')})")
481
+ if s.get("key_speakers"):
482
+ print(f" Speakers: {s['key_speakers']}")
483
+ print(f" Insights extracted: {s.get('insights_extracted', 0)}")
484
+ print()
485
+
486
+ # ----------------------------------------------------------
487
+ # Public: Analyze document and update KB
488
+ # ----------------------------------------------------------
489
+
490
+ def analyze(self, file_path: str) -> Path:
491
+ """Analyze a document for GURMA.ai-relevant SOTA insights and update KB.
492
+
493
+ Reads the file, scores sections for relevance, uses LLM to extract
494
+ structured insights, and merges new findings into the knowledge base.
495
+
496
+ Args:
497
+ file_path: Path to markdown/text file
498
+
499
+ Returns: Path to generated analysis report
500
+ """
501
+ try:
502
+ from .config import PROJECT_ROOT
503
+ except ImportError:
504
+ from config import PROJECT_ROOT
505
+
506
+ path = Path(file_path)
507
+ if not path.is_absolute():
508
+ path = PROJECT_ROOT / file_path
509
+
510
+ if not path.exists():
511
+ raise FileNotFoundError(f"File not found: {path}")
512
+
513
+ print(f"\n{'='*60}")
514
+ print(f"Analyzing: {path.name}")
515
+ print(f"{'='*60}\n")
516
+
517
+ text = path.read_text(encoding="utf-8")
518
+
519
+ # Score sections for relevance
520
+ sections = self._split_sections(text)
521
+ scored = []
522
+ for sec in sections:
523
+ if len(sec.strip()) < 50:
524
+ continue
525
+ score, tags = self._score_relevance(sec)
526
+ if score > 0:
527
+ scored.append({"text": sec.strip()[:500], "score": score, "tags": tags})
528
+ scored.sort(key=lambda x: -x["score"])
529
+
530
+ print(f"Sections: {len(sections)} total, {len(scored)} relevant")
531
+
532
+ # LLM extraction — structured insights for KB update
533
+ llm_update = None
534
+ if self.llm.enabled:
535
+ print("[LLM] Extracting structured insights...")
536
+ llm_update = self._extract_kb_updates(text, path.name)
537
+ if llm_update:
538
+ n_models = len(llm_update.get("new_models", []))
539
+ n_tech = len(llm_update.get("new_techniques", []))
540
+ n_actions = len(llm_update.get("new_action_items", []))
541
+ print(f"[LLM] Found: {n_models} models, {n_tech} techniques, {n_actions} action items")
542
+ else:
543
+ print("[INFO] LLM not available — relevance scoring only, no KB update")
544
+
545
+ # Merge into knowledge base
546
+ changes = self._merge_updates(llm_update, path.name, len(scored))
547
+
548
+ # Generate report
549
+ report_path = self._generate_report(path.name, scored, llm_update, changes)
550
+
551
+ # Save updated KB
552
+ self._save_kb()
553
+
554
+ print(f"\nKB updated: {changes}")
555
+ print(f"Report: {report_path}")
556
+ return report_path
557
+
558
+ # ----------------------------------------------------------
559
+ # Internal: Relevance scoring
560
+ # ----------------------------------------------------------
561
+
562
+ def _score_relevance(self, text: str) -> tuple[float, list[str]]:
563
+ """Score text against GURMA.ai's high-precision domain themes.
564
+
565
+ Returns (score 0.0-1.0, matched signal tags).
566
+ """
567
+ text_lower = text.lower()
568
+ total_weight = 0
569
+ max_possible = sum(s["weight"] for s in GURMA_RELEVANCE_SIGNALS.values())
570
+ matched_tags = []
571
+
572
+ for tag, signal in GURMA_RELEVANCE_SIGNALS.items():
573
+ for pattern in signal["patterns"]:
574
+ if re.search(pattern, text_lower, re.IGNORECASE):
575
+ total_weight += signal["weight"]
576
+ matched_tags.append(tag)
577
+ break
578
+
579
+ score = min(total_weight / max_possible, 1.0) if max_possible > 0 else 0.0
580
+ return round(score, 3), matched_tags
581
+
582
+ def _split_sections(self, text: str) -> list[str]:
583
+ """Split markdown into logical sections."""
584
+ sections = re.split(r'\n#{1,3}\s+', text)
585
+ return [s.strip() for s in sections if s.strip()]
586
+
587
+ # ----------------------------------------------------------
588
+ # Internal: LLM extraction for KB update
589
+ # ----------------------------------------------------------
590
+
591
+ def _extract_kb_updates(self, text: str, filename: str) -> Optional[dict]:
592
+ """Use LLM to extract structured KB updates from a document.
593
+
594
+ Returns dict with new_models, new_techniques, stack_updates,
595
+ new_principles, new_action_items, and strategic_note.
596
+ """
597
+ truncated = text[:12000]
598
+
599
+ # Include current KB state so LLM can detect what's truly new
600
+ current_models = ", ".join(m["name"] for m in self.kb.get("models", []))
601
+ current_techniques = ", ".join(t["name"] for t in self.kb.get("techniques", []))
602
+
603
+ system = (
604
+ "You are the technology intelligence agent for GURMA.ai, a Swiss AI company "
605
+ "building high-precision models for rehabilitation robotics. "
606
+ "GURMA.ai has 15 years of patient outcome data (gait dynamics, EMG, recovery outcomes) "
607
+ "from BAMA Teknoloji. Their domain requires SUPER-HIGH PRECISION — "
608
+ "wrong therapy parameters can harm patients. "
609
+ "Architecture: privacy-first edge computing. "
610
+ "Regulatory: EU AI Act (high-risk), MDR, ISO 13485. "
611
+ "Core thesis: proprietary outcome data + domain expertise + safety focus = defensible moat. "
612
+ "Your job: extract technology insights that help GURMA.ai build better, "
613
+ "safer, more precise models for this domain."
614
+ )
615
+
616
+ prompt = f"""Analyze this document and extract NEW technology insights for GURMA.ai's knowledge base.
617
+
618
+ Document: {filename}
619
+ ---
620
+ {truncated}
621
+ ---
622
+
623
+ Already tracked models: {current_models}
624
+ Already tracked techniques: {current_techniques}
625
+
626
+ Extract ONLY information that is NEW or updates existing knowledge.
627
+ Focus on what matters for a high-precision medical AI domain.
628
+
629
+ Return JSON:
630
+ {{
631
+ "new_models": [
632
+ {{
633
+ "name": "Model name",
634
+ "params": "Size/params",
635
+ "why": "Why it matters in general",
636
+ "gurma_fit": "Specific fit for GURMA.ai's high-precision rehab domain",
637
+ "status": "recommended|watch|deprecated"
638
+ }}
639
+ ],
640
+ "new_techniques": [
641
+ {{
642
+ "name": "Technique name",
643
+ "category": "post-training|fine-tuning|data-processing|efficiency|simulation|safety",
644
+ "why": "Why it matters",
645
+ "gurma_fit": "How GURMA.ai should use it for high-precision medical AI",
646
+ "priority": "high|medium|low"
647
+ }}
648
+ ],
649
+ "stack_updates": [
650
+ {{
651
+ "component": "Which tech stack component to update",
652
+ "recommendation": "New recommendation",
653
+ "rationale": "Why this change"
654
+ }}
655
+ ],
656
+ "new_principles": [
657
+ {{
658
+ "principle": "Short principle statement",
659
+ "detail": "Explanation and evidence"
660
+ }}
661
+ ],
662
+ "new_action_items": [
663
+ "Concrete next step for GURMA.ai"
664
+ ],
665
+ "strategic_note": "How this document affects GURMA.ai's strategy (1-2 sentences, or null if no change)"
666
+ }}
667
+
668
+ Rules:
669
+ - ONLY include genuinely new information not already in the tracked lists
670
+ - Every item must connect to GURMA.ai's HIGH-PRECISION medical domain
671
+ - If a model/technique is already tracked, skip it (don't duplicate)
672
+ - If existing knowledge should be UPDATED (e.g. new version), include it with the update
673
+ - Be specific: name versions, papers, benchmarks
674
+ - Empty arrays are fine if nothing new is found"""
675
+
676
+ response = self.llm.call(prompt, system, max_tokens=3000)
677
+ if response:
678
+ match = re.search(r'\{.*\}', response, re.DOTALL)
679
+ if match:
680
+ try:
681
+ return json.loads(match.group())
682
+ except Exception:
683
+ pass
684
+ return None
685
+
686
+ # ----------------------------------------------------------
687
+ # Internal: Merge updates into KB
688
+ # ----------------------------------------------------------
689
+
690
+ def _merge_updates(self, llm_update: Optional[dict], source_name: str,
691
+ insights_count: int) -> dict:
692
+ """Merge LLM-extracted updates into the knowledge base.
693
+
694
+ Returns summary of changes made.
695
+ """
696
+ changes = {"models_added": 0, "techniques_added": 0,
697
+ "stack_updated": 0, "principles_added": 0,
698
+ "actions_added": 0}
699
+
700
+ # Record source
701
+ self.kb.setdefault("sources_analyzed", []).append({
702
+ "name": source_name,
703
+ "type": "document",
704
+ "date": datetime.now().strftime("%Y-%m-%d"),
705
+ "insights_extracted": insights_count,
706
+ })
707
+
708
+ if not llm_update:
709
+ return changes
710
+
711
+ today = datetime.now().strftime("%Y-%m-%d")
712
+
713
+ # Merge models
714
+ existing_names = {m["name"].lower() for m in self.kb.get("models", [])}
715
+ for m in llm_update.get("new_models", []):
716
+ if m.get("name", "").lower() not in existing_names:
717
+ m["added"] = today
718
+ m["source"] = source_name
719
+ self.kb["models"].append(m)
720
+ changes["models_added"] += 1
721
+
722
+ # Merge techniques
723
+ existing_tech = {t["name"].lower() for t in self.kb.get("techniques", [])}
724
+ for t in llm_update.get("new_techniques", []):
725
+ if t.get("name", "").lower() not in existing_tech:
726
+ t["added"] = today
727
+ t["source"] = source_name
728
+ self.kb["techniques"].append(t)
729
+ changes["techniques_added"] += 1
730
+
731
+ # Stack updates — replace matching components
732
+ for su in llm_update.get("stack_updates", []):
733
+ component = su.get("component", "")
734
+ updated = False
735
+ for i, existing in enumerate(self.kb.get("tech_stack", [])):
736
+ if existing["component"].lower() == component.lower():
737
+ self.kb["tech_stack"][i] = su
738
+ updated = True
739
+ changes["stack_updated"] += 1
740
+ break
741
+ if not updated and component:
742
+ self.kb["tech_stack"].append(su)
743
+ changes["stack_updated"] += 1
744
+
745
+ # Merge principles
746
+ existing_principles = {p["principle"].lower() for p in self.kb.get("key_principles", [])}
747
+ for p in llm_update.get("new_principles", []):
748
+ if p.get("principle", "").lower() not in existing_principles:
749
+ p["source"] = source_name
750
+ self.kb["key_principles"].append(p)
751
+ changes["principles_added"] += 1
752
+
753
+ # Merge action items
754
+ existing_actions = {a["item"].lower() for a in self.kb.get("action_items", [])}
755
+ for ai in llm_update.get("new_action_items", []):
756
+ if ai.lower() not in existing_actions:
757
+ self.kb["action_items"].append({"item": ai, "status": "pending"})
758
+ changes["actions_added"] += 1
759
+
760
+ # Bump version
761
+ self.kb["metadata"]["version"] = self.kb["metadata"].get("version", 0) + 1
762
+
763
+ return changes
764
+
765
+ # ----------------------------------------------------------
766
+ # Internal: Report generation
767
+ # ----------------------------------------------------------
768
+
769
+ def _generate_report(self, filename: str, scored_sections: list[dict],
770
+ llm_update: Optional[dict], changes: dict) -> Path:
771
+ """Generate analysis report as markdown."""
772
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
773
+ slug = re.sub(r'[^a-z0-9]', '-', filename.lower().rsplit('.', 1)[0])[:40]
774
+ report_path = self.kb_dir / f"{timestamp}_{slug}_analysis.md"
775
+
776
+ lines = [
777
+ f"# SOTA Analysis: {filename}",
778
+ "",
779
+ f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')} ",
780
+ f"**Source:** {filename} ",
781
+ f"**Relevant sections:** {len(scored_sections)} ",
782
+ f"**KB version:** {self.kb['metadata'].get('version', '?')} ",
783
+ "",
784
+ ]
785
+
786
+ # Changes summary
787
+ total_changes = sum(changes.values())
788
+ if total_changes > 0:
789
+ lines.append("## Knowledge Base Updates")
790
+ lines.append("")
791
+ if changes["models_added"]:
792
+ lines.append(f"- **{changes['models_added']}** new model(s) added")
793
+ if changes["techniques_added"]:
794
+ lines.append(f"- **{changes['techniques_added']}** new technique(s) added")
795
+ if changes["stack_updated"]:
796
+ lines.append(f"- **{changes['stack_updated']}** tech stack update(s)")
797
+ if changes["principles_added"]:
798
+ lines.append(f"- **{changes['principles_added']}** new principle(s)")
799
+ if changes["actions_added"]:
800
+ lines.append(f"- **{changes['actions_added']}** new action item(s)")
801
+ lines.append("")
802
+ else:
803
+ lines.append("*No new knowledge extracted (document may cover already-tracked topics).*")
804
+ lines.append("")
805
+
806
+ # LLM-extracted details
807
+ if llm_update:
808
+ if llm_update.get("strategic_note"):
809
+ lines.append("## Strategic Note")
810
+ lines.append(llm_update["strategic_note"])
811
+ lines.append("")
812
+
813
+ for m in llm_update.get("new_models", []):
814
+ lines.append(f"### New Model: {m.get('name', '?')}")
815
+ lines.append(f"- **Params:** {m.get('params', '?')}")
816
+ lines.append(f"- **Why:** {m.get('why', '')}")
817
+ lines.append(f"- **GURMA.ai fit:** {m.get('gurma_fit', '')}")
818
+ lines.append("")
819
+
820
+ for t in llm_update.get("new_techniques", []):
821
+ lines.append(f"### New Technique: {t.get('name', '?')}")
822
+ lines.append(f"- **Category:** {t.get('category', '?')}")
823
+ lines.append(f"- **Why:** {t.get('why', '')}")
824
+ lines.append(f"- **GURMA.ai fit:** {t.get('gurma_fit', '')}")
825
+ lines.append(f"- **Priority:** {t.get('priority', '?')}")
826
+ lines.append("")
827
+
828
+ if llm_update.get("new_action_items"):
829
+ lines.append("## New Action Items")
830
+ lines.append("")
831
+ for ai in llm_update["new_action_items"]:
832
+ lines.append(f"- [ ] {ai}")
833
+ lines.append("")
834
+
835
+ # Relevance-scored sections
836
+ if scored_sections:
837
+ lines.append("---")
838
+ lines.append("")
839
+ lines.append("## Relevance-Scored Sections")
840
+ lines.append("")
841
+ for s in scored_sections[:10]:
842
+ tags_str = ", ".join(s["tags"])
843
+ lines.append(f"**Score: {s['score']:.2f}** — tags: {tags_str}")
844
+ lines.append(f"> {s['text'][:300]}")
845
+ lines.append("")
846
+
847
+ with open(report_path, "w") as f:
848
+ f.write("\n".join(lines))
849
+
850
+ return report_path
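A usage sketch for the agent above. The snippet text is invented, and _score_relevance is internal (called here only to illustrate the scoring): the matched signals weigh 3+3+2+2 against the 21-point maximum, so the expected score is 10/21 ≈ 0.476.

    from sota_agent import SOTAScoutAgent

    agent = SOTAScoutAgent()   # seeds RESEARCH_DIR / "sota" / "knowledge_base.json" on first run
    agent.show("models")       # print tracked models

    snippet = ("RLVR turns patient outcome metrics into verifiable rewards, "
               "and quantized models make edge deployment practical.")
    score, tags = agent._score_relevance(snippet)
    print(score, tags)
    # 0.476 ['outcome_data', 'high_precision', 'rl_training', 'edge_privacy']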
tr_agents.py ADDED
@@ -0,0 +1,480 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ GURMA.ai Turkish Research Agents - v2 (Enhanced)
4
+
5
+ Two specialized agents for Turkish-language web research, enhanced with strategic
6
+ context for Gurma AI's market entry.
7
+
8
+ 1. **MaliMusavirAgent**: Researches company formation, tax, IP, and medical
9
+ device regulations.
10
+
11
+ 2. **FonArastirmaAgent**: Researches R&D funding, focusing on bilateral
12
+ Swiss-Turkish opportunities and leveraging the BAMA partnership.
13
+
14
+ Both agents search in Turkish and produce structured data for a Cursor agent
15
+ (e.g., Claude Opus) to synthesize into actionable reports.
16
+
17
+ Usage:
18
+ # Ensure gurma-context.md is in the same directory
19
+ python research.py mali # Full company formation research
20
+ python research.py fonlar -c tubitak # Specific funding category
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import re
27
+ import time
28
+ from dataclasses import dataclass, field
29
+ from datetime import datetime
30
+ from pathlib import Path
31
+
32
+ try:
33
+ from .search import SearchService
34
+ from .config import RESEARCH_DIR
35
+ except ImportError:
36
+ from search import SearchService
37
+ from config import RESEARCH_DIR
38
+
39
+ # ============================================================
40
+ # Turkish Authoritative Source Domains
41
+ # ============================================================
42
+
43
+ TR_PRIMARY_SOURCES = {
44
+ "tubitak.gov.tr", "kosgeb.gov.tr", "sanayi.gov.tr",
45
+ "ticaret.gov.tr", "ticaretsicil.gov.tr", "gib.gov.tr",
46
+ "resmigazete.gov.tr", "mevzuat.gov.tr", "iskur.gov.tr",
47
+ "yatirimadestek.gov.tr", "teydeb.tubitak.gov.tr",
48
+ "teknokent.org.tr", "teknopark.gov.tr", # .org.tr is also common
49
+ "btk.gov.tr", "kvkk.gov.tr", "titck.gov.tr", # Turkish Medicines and Medical Devices Agency
50
+ "ailevecalisma.gov.tr", "sgk.gov.tr",
51
+ "invest.gov.tr", # Investment Office of Turkey
52
+ }
53
+
54
+
55
+ # ============================================================
56
+ # Agent 1: Mali Müşavir — Company Formation & Regulation
57
+ # ============================================================
58
+
59
+ MALI_CATEGORIES = {
60
+ "sirket_kurulum": {
61
+ "label": "Şirket Kuruluş Adımları",
62
+ "queries": [
63
+ "yabancı sermayeli teknoloji şirketi kuruluş adımları türkiye 2025 2026",
64
+ "limited şirket (ltd) ve anonim şirket (aş) kuruluş prosedürleri",
65
+ "türkiye'de şirket kurmak için gerekli belgeler ticaret sicil noter",
66
+ "isviçre merkezli bir şirketin türkiye'de şube veya yan kuruluş açması",
67
+ "online şirket kuruluşu mümkün mü MERSİS süreci",
68
+ ],
69
+ },
70
+ "sirket_turu": {
71
+ "label": "Şirket Türü Seçimi (AR-GE Odaklı)",
72
+ "queries": [
73
+ "ltd mi aş mi AR-GE ve yazılım şirketi için karşılaştırma 2026",
74
+ "anonim şirket ve limited şirket vergi ve sorumluluk farkları",
75
+ "yabancı ortaklı şirketler için en uygun şirket türü türkiye",
76
+ "devlet teşvikleri ve fonlara erişim için şirket türü önemli mi",
77
+ ],
78
+ },
79
+ "vergi_tesvik": {
80
+ "label": "Vergi ve Teşvikler (Teknoloji)",
81
+ "queries": [
82
+ "5746 sayılı AR-GE kanunu teşvikleri güncel 2025 2026",
83
+ "teknokent dışı AR-GE merkezi vergi avantajları",
84
+ "yazılım ve yapay zeka ihracatı vergi istisnaları türkiye",
85
+ "kurumlar vergisi ve KDV istisnası teknoloji şirketleri",
86
+ "SGK işveren primi desteği AR-GE personeli için",
87
+ ],
88
+ },
89
+ "teknokent_teknopark": {
90
+ "label": "Teknokent ve AR-GE Merkezleri",
91
+ "queries": [
92
+ "teknopark başvuru ve kabul kriterleri yapay zeka medikal cihaz",
93
+ "istanbul ankara izmir önde gelen teknoparklar ve uzmanlık alanları",
94
+ "teknopark avantajları vergi kira altyapı",
95
+ "AR-GE merkezi kurma şartları ve avantajları teknopark dışında",
96
+ "BAMA Teknoloji hangi teknoparkta yer alıyor",
97
+ ],
98
+ },
99
+ "maliyet_surec": {
100
+ "label": "Maliyet ve Süreç Takvimi",
101
+ "queries": [
102
+ "türkiye'de şirket kuruluş toplam maliyeti 2026 (noter harç sermaye)",
103
+ "şirket kuruluş süresi ortalama kaç gün 2026",
104
+ "kuruluş sonrası zorunlu adımlar (SGK vergi dairesi belediye)",
105
+ "aylık sabit giderler teknoloji şirketi (muhasebe bağkur sgk)",
106
+ ],
107
+ },
108
+ "ip_data_sovereignty": {
109
+ "label": "Fikri Mülkiyet ve Veri Mevzuatı",
110
+ "queries": [
111
+ "türkiye'de yazılım ve yapay zeka algoritması fikri mülkiyet koruması",
112
+ "KVKK (kişisel verilerin korunması kanunu) sağlık verileri yönetmeliği",
113
+ "sağlık verilerinin yurtdışına aktarımı KVKK izinler",
114
+ "anonimleştirilmiş veri ile AR-GE çalışması yasal çerçeve türkiye",
115
+ "isviçre-türkiye veri transferi anlaşmaları",
116
+ ],
117
+ },
118
+ "regulatory_medical": {
119
+ "label": "Medikal Cihaz Mevzuatı (AI Odaklı)",
120
+ "queries": [
121
+ "TİTCK yapay zeka tabanlı yazılımlar için medikal cihaz düzenlemesi",
122
+ "türkiye medikal cihaz yönetmeliği (MDR) ve CE işareti tanınırlığı",
123
+ "yapay zeka rehabilitasyon cihazları için klinik araştırma gereklilikleri türkiye",
124
+ "tıbbi cihaz kayıt ve onay süreci TİTCK ÜTS sistemi",
125
+ "yazılım bir tıbbi cihaz mıdır (SaMD) türkiye sınıflandırması",
126
+ ],
127
+ },
128
+ }
129
+
+ MALI_SYNTHESIS_QUESTIONS = {
+     "sirket_kurulum": [
+         "İsviçre merkezli Gurma AI için Türkiye'de bir yan kuruluş (subsidiary) kurmanın adımları nelerdir?",
+         "Gerekli ana belgeler nelerdir ve bu belgelerin İsviçre'den nasıl hazırlanması gerekir (apostil vb.)?",
+         "Sürecin ne kadarı uzaktan (online) yönetilebilir, hangi aşamalarda Türkiye'de fiziksel bulunma zorunludur?",
+     ],
+     "sirket_turu": [
+         "Gurma'nın AR-GE ve fon odaklı hedefleri için Ltd. mi A.Ş. mi daha mantıklı? Karar matrisi oluşturun.",
+         "Minimum sermaye gereksinimleri ve bu sermayenin blokesi/kullanımı nasıl işliyor?",
+         "Seçilen şirket türü, gelecekte yatırımcı alma veya hisse devri operasyonlarını nasıl etkiler?",
+     ],
+     "vergi_tesvik": [
+         "Gurma'nın yararlanabileceği temel vergi avantajları (Kurumlar, KDV, Gelir Vergisi Stopajı) nelerdir?",
+         "5746 sayılı kanun kapsamında, Teknopark içinde ve dışında olmanın avantaj/dezavantajları nelerdir?",
+         "Yapay zeka ve medikal cihaz ihracatı için özel bir vergi indirimi var mı?",
+     ],
+     "teknokent_teknopark": [
+         "Gurma'nın profiline (AI + Medikal Cihaz) en uygun 3 Teknopark hangisidir ve neden?",
+         "Teknopark'a kabul için proje başvurusunda nelere dikkat edilmeli? BAMA'nın deneyiminden nasıl yararlanılır?",
+         "Teknopark'ta yer almanın IP koruması ve veri güvenliği açısından ek bir avantajı var mı?",
+     ],
+     "maliyet_surec": [
+         "Bir A.Ş. kurmak için başlangıçta ne kadar bir bütçe ayrılmalı (minimum sermaye + masraflar)?",
+         "Şirketin yasal olarak faaliyete geçmesi ne kadar sürer? (iyimser ve kötümser senaryo)",
+         "Faaliyete geçtikten sonraki ilk 3 ay içinde tamamlanması gereken zorunlu işlemler nelerdir?",
+     ],
+     "ip_data_sovereignty": [
+         "Gurma'nın temel IP'si olan AI modellerini Türkiye'de nasıl koruma altına alabiliriz? (Patent, telif hakkı vb.)",
+         "KVKK uyarınca, rehabilitasyon verilerini işlerken nelere dikkat edilmeli? Veri Türkiye'de mi kalmalı?",
+         "İsviçre'deki ana şirket ile Türkiye'deki AR-GE birimi arasında veri (özellikle anonimleştirilmiş hasta verisi) transferi için yasal zemin nedir?",
+     ],
+     "regulatory_medical": [
+         "Gurma'nın AI yazılımı Türkiye'de bir 'tıbbi cihaz' olarak kabul edilecek mi? TİTCK'nın bu konudaki kriterleri nelerdir?",
+         "Eğer tıbbi cihaz ise, AB'den alınacak bir CE belgesi Türkiye'de doğrudan geçerli midir, yoksa ek TİTCK onayı gerekir mi?",
+         "Pazara sunmadan önce Türkiye'de bir klinik doğrulama/araştırma yapma zorunluluğu var mı?",
+     ],
+ }
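+
+ # NOTE: MALI_SYNTHESIS_QUESTIONS is keyed by the same category slugs as
+ # MALI_CATEGORIES; the base agent below joins the two dicts by key when it
+ # detects coverage gaps and writes report sections, so a new category should
+ # be added to both dicts.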
+
+
+ # ============================================================
+ # Agent 2: Fon Araştırma — TÜBİTAK & Stratejik Ortaklık
+ # ============================================================
+
+ FON_CATEGORIES = {
+     "tubitak": {
+         "label": "TÜBİTAK Destek Programları",
+         "queries": [
+             "TÜBİTAK TEYDEB 1501 1507 destek programları yapay zeka medikal cihaz 2026",
+             "TÜBİTAK 1702 patent lisanslama desteği yabancı teknoloji",
+             "TÜBİTAK yapay zeka enstitüsü proje çağrıları",
+             "TÜBİTAK sağlık bilimleri araştırma destek grubu (SBAG) çağrıları",
+             "yeni kurulan teknoloji şirketleri için TÜBİTAK BİGG programı şartları",
+         ],
+     },
+     "kosgeb": {
+         "label": "KOSGEB Destekleri",
+         "queries": [
+             "KOSGEB AR-GE ÜR-GE ve inovasyon destek programı 2026",
+             "KOSGEB teknoloji odaklı sanayi hamlesi programı medikal cihaz",
+             "yabancı ortaklı KOBİ'ler KOSGEB desteklerinden yararlanabilir mi",
+             "KOSGEB stratejik ürün destek programı şartları",
+         ],
+     },
+     "sanayi_bakanligi": {
+         "label": "Sanayi Bakanlığı ve Kalkınma Ajansları",
+         "queries": [
+             "sanayi ve teknoloji bakanlığı yatırım teşvik belgesi AR-GE yatırımı",
+             "ulusal yapay zeka stratejisi 2025 kapsamında açılan fon ve destekler",
+             "kalkınma ajansları (ISTKA IZMIRKA) güdümlü proje desteği sağlık teknolojileri",
+             "teknoloji geliştirme bölgeleri (TGB) ek destek ve hibeler",
+         ],
+     },
+     "ab_fonlari": {
+         "label": "AB ve Uluslararası Fonlar (İsviçre-Türkiye Odaklı)",
+         "queries": [
+             "TÜBİTAK-SNSF (İsviçre) ikili işbirliği programı ve başvuru şartları",
+             "EUREKA Eurostars programı türkiye isviçre ortaklığı",
+             "Horizon Europe programına türkiye'den katılım ve yapay zeka sağlık çağrıları",
+             "İsviçre ve Türkiye'nin ortak katıldığı uluslararası AR-GE fonları",
+         ],
+     },
+     "basvuru_surec": {
+         "label": "Başvuru Süreçleri ve Stratejileri",
+         "queries": [
+             "TÜBİTAK TEYDEB proje önerisi hazırlama kılavuzu ve hakem değerlendirme kriterleri",
+             "başarılı bir TÜBİTAK projesi bütçesi nasıl hazırlanır (personel makine hizmet alımı)",
+             "TÜBİTAK proje başvurularında sık yapılan hatalar ve reddedilme nedenleri",
+             "proje yürütücüsü ve AR-GE personeli nitelikleri TÜBİTAK kriterleri",
+         ],
+     },
+     "bama_joint_strategy": {
+         "label": "BAMA Ortaklığı ile Stratejik Fon Başvurusu",
+         "queries": [
+             "TÜBİTAK ortaklı proje başvurusu nasıl yapılır (KOBİ-KOBİ işbirliği)",
+             "BAMA Teknoloji'nin 'yerli üretici' statüsü fon başvurularında avantaj sağlar mı",
+             "BAMA Teknoloji'nin tamamladığı TÜBİTAK veya KOSGEB projeleri var mı",
+             "Gurma (yeni) ve BAMA (tecrübeli) ortaklığında bir proje kurgusu nasıl olmalı",
+         ],
+     },
+     "competitor_analysis_tr": {
+         "label": "Türkiye'deki Rakiplerin Fon Geçmişi",
+         "queries": [
+             "rehabilitasyon robotiği alanında TÜBİTAK desteği alan türk firmaları",
+             "Hocoma Ekso Bionics gibi yabancı firmaların türkiye'de aldığı teşvik var mı",
+             "medikal cihaz ve yazılım alanında başarılı KOSGEB projesi örnekleri",
+             "türkiye'de sağlık teknolojileri alanında yatırım alan startuplar ve aldıkları hibeler",
+         ],
+     },
+ }
+
+ FON_SYNTHESIS_QUESTIONS = {
+     "tubitak": [
+         "Gurma'nın mevcut durumu (yeni kuruluş, AI/medikal odaklı) için en uygun 2-3 TÜBİTAK programı hangisidir?",
+         "Bu programların sağladığı hibe oranı, toplam bütçe ve proje süresi nedir?",
+         "Yakın zamanda açılacak veya şu an açık olan özel bir 'yapay zeka' veya 'sağlık teknolojileri' çağrısı var mı?",
+     ],
+     "kosgeb": [
+         "Yeni kurulacak yabancı ortaklı bir şirket, KOSGEB'in hangi desteklerinden faydalanabilir?",
+         "KOSGEB mi TÜBİTAK mı? Gurma'nın AR-GE projesi için hangisi daha uygun bir başlangıç noktasıdır?",
+     ],
+     "sanayi_bakanligi": [
+         "'Yatırım Teşvik Belgesi' almanın Gurma için en somut faydaları neler olur? Süreç ne kadar karmaşık?",
+         "İstanbul veya İzmir'deki Kalkınma Ajansları, Gurma'nın projesine özel bir destek sağlayabilir mi?",
+     ],
+     "ab_fonlari": [
+         "**En Önemli Soru**: TÜBİTAK-SNSF (İsviçre) ikili işbirliği programının güncel durumu nedir? Başvuru tarihleri ve başarı oranları hakkında ne biliniyor?",
+         "Gurma (İsviçre) ve Gurma (Türkiye) arasında bir EUREKA projesi kurgulamak mümkün ve mantıklı mıdır?",
+     ],
+     "basvuru_surec": [
+         "Bir TÜBİTAK 1501 proje başvurusunun ana adımları ve zaman çizelgesi nedir?",
+         "Hakemlerin projeyi değerlendirirken en çok dikkat ettiği 3 kritik nokta nedir? (Örn: İnovatif yön, ticarileşme potansiyeli)",
+         "Proje bütçesinde hangi harcamalar desteklenir, hangileri desteklenmez?",
+     ],
+     "bama_joint_strategy": [
+         "Gurma ve BAMA'nın birlikte başvurabileceği en mantıklı fon hangisidir? Bu ortaklık başvuruda nasıl bir avantaj yaratır?",
+         "BAMA'nın mevcut deneyimi ve 'yerli üretici' kimliği, proje kabul şansını ne kadar artırır?",
+         "Ortak bir projede IP (fikri mülkiyet) paylaşımı nasıl düzenlenmelidir?",
+     ],
+     "competitor_analysis_tr": [
+         "Türkiye'de rehabilitasyon teknolojileri alanında kimler devlet desteği alıyor? Bu projelerin odak noktaları neler?",
+         "Rakiplerin aldığı destekler, pazarın hangi yöne gittiğini ve hangi teknolojilerin önceliklendirildiğini gösteriyor mu?",
+     ],
+ }
+
+
+ # ============================================================
+ # Shared Dataclasses and Base Agent
+ # (This section is largely unchanged from the original v1)
+ # ============================================================
+
+ @dataclass
+ class TRIntelSection:
+     """A section of the Turkish research report."""
+     category: str
+     label: str
+     queries_executed: list = field(default_factory=list)
+     results: list = field(default_factory=list)
+     findings: list = field(default_factory=list)
+     gaps: list = field(default_factory=list)
+     sources: list = field(default_factory=list)
+
+
+ class TRResearchAgent:
+     """Base agent for Turkish-language structured web research."""
+
+     CATEGORIES: dict = {}
+     SYNTHESIS_QUESTIONS: dict = {}
+     OUTPUT_SUBDIR: str = "tr"
+     REPORT_TITLE: str = "Türkçe Araştırma Raporu"
+
+     def __init__(self, search: SearchService | None = None):
+         self.search = search or SearchService()
+         self.sections: dict[str, TRIntelSection] = {}
+         self.output_dir = RESEARCH_DIR / self.OUTPUT_SUBDIR
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+     def run(self, categories: list[str] | None = None, delay: float = 1.0, max_results: int = 10) -> Path:
+         cats_to_run = categories or list(self.CATEGORIES.keys())
+
+         total_queries = sum(len(self.CATEGORIES[c]["queries"]) for c in cats_to_run if c in self.CATEGORIES)
+
+         print(f"\n{'='*60}")
+         print(f"{self.REPORT_TITLE}")
+         print(f"Kategoriler: {len(cats_to_run)} | Sorgular: ~{total_queries}")
+         print(f"{'='*60}\n")
+
+         for cat_key in cats_to_run:
+             if cat_key not in self.CATEGORIES:
+                 print(f"[ATLA] Bilinmeyen kategori: {cat_key}")
+                 continue
+
+             cat_details = self.CATEGORIES[cat_key]
+             section = TRIntelSection(category=cat_key, label=cat_details["label"])
+             self._research_category(section, cat_details, delay, max_results)
+             self.sections[cat_key] = section
+
+         report_path = self._generate_report()
+         self._save_data()
+
+         print(f"\n{'='*60}")
+         print(f"Rapor ve Veri Dosyaları Oluşturuldu: {self.output_dir}")
+         total_findings = sum(len(s.findings) for s in self.sections.values())
+         print(f"Toplam Bulgular: {total_findings}")
+         print(f"{'='*60}\n")
+
+         return report_path
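+
+     # Illustrative API usage (the agent classes are defined at the bottom of
+     # this module): run only selected categories with a slower crawl, e.g.
+     #   MaliMusavirAgent().run(categories=["vergi_tesvik"], delay=2.0)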
+
+     def _research_category(self, section: TRIntelSection, cat_details: dict, delay: float, max_results: int):
+         print(f"\n--- Kategori: {section.label} ---")
+
+         for query in cat_details["queries"]:
+             print(f"  [ARAMA] {query}")
+             try:
+                 # Saving of per-query intermediate results is disabled for now;
+                 # only the aggregated report and JSON dump are persisted.
+                 results = self.search.search(query, max_results=max_results, save=False)
+                 section.queries_executed.append(query)
+                 section.results.extend(results)
+                 section.sources.extend(r.url for r in results if r.url and r.url not in section.sources)
+                 print(f"    -> {len(results)} sonuç bulundu.")
+             except Exception as e:
+                 print(f"    -> Arama sırasında hata: {e}")
+
+             if delay > 0:
+                 time.sleep(delay)
+
+         section.findings = self._analyze_and_deduplicate(section)
+         section.gaps = self._detect_gaps(section)
+
+         confirmed_count = sum(1 for f in section.findings if f.get("confirmed"))
+         print(f"  [ANALİZ] {len(section.findings)} özgün bulgu ({confirmed_count} resmi kaynaklı). Eksikler: {len(section.gaps)}")
+
+     def _analyze_and_deduplicate(self, section: TRIntelSection) -> list[dict]:
+         findings = []
+         seen_snippets = set()
+
+         for r in sorted(section.results, key=lambda x: x.url):
+             # Basic deduplication on a normalized snippet prefix; Turkish
+             # characters are kept in the key so that distinct Turkish snippets
+             # do not collapse onto the same ASCII-only key.
+             snippet_key = re.sub(r"[^a-z0-9çğıöşü]", "", r.snippet.lower()[:100])
+             if snippet_key in seen_snippets:
+                 continue
+             seen_snippets.add(snippet_key)
+
+             # Findings from official domains (TR_PRIMARY_SOURCES, defined
+             # above) are flagged as confirmed.
+             is_primary = any(domain in r.url for domain in TR_PRIMARY_SOURCES) if r.url else False
+
+             findings.append({
+                 "text": f"{r.title}: {r.snippet}",
+                 "confirmed": is_primary,
+                 "source": r.url or "",
+             })
+         return findings[:25]  # Limit findings per section
+
+     def _detect_gaps(self, section: TRIntelSection) -> list[dict]:
+         questions = self.SYNTHESIS_QUESTIONS.get(section.category, [])
+         if not questions:
+             return []
+
+         all_text = " ".join(f["text"].lower() for f in section.findings)
+
+         gaps = []
+         for q in questions:
+             # Simple keyword matching: a question counts as a gap when fewer
+             # than 20% of its 4+ character words occur in the findings text.
+             keywords = re.findall(r"\w{4,}", q.lower())
+             matches = sum(1 for kw in keywords if kw in all_text)
+             if not all_text or matches < len(keywords) * 0.2:
+                 gaps.append({"text": q})
+         return gaps
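+
+     # Worked example of the 20% threshold (illustrative numbers): for a
+     # question whose 4+ character keywords are ["minimum", "sermaye",
+     # "gereksinimleri", "sermayenin", "blokesi", "kullanımı", "nasıl",
+     # "işliyor"], a findings text that contains only "sermaye" yields
+     # matches = 1, and 1 < 8 * 0.2 = 1.6, so the question is reported as a gap.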
+
+     def _generate_report(self) -> Path:
+         # This is a lightweight data dump; the main report is synthesized by Cursor
+         timestamp = datetime.now().strftime("%Y-%m-%d")
+         slug = self.OUTPUT_SUBDIR.replace("/", "-")
+         report_path = self.output_dir / f"{slug}-rapor-{timestamp}.md"
+
+         lines = [f"# {self.REPORT_TITLE} - Veri Dökümü", f"Tarih: {datetime.now().isoformat()}", ""]
+
+         for cat_key, section in self.sections.items():
+             lines.extend([f"## {section.label}", ""])
+             lines.append("### Bulgular")
+             for f in section.findings:
+                 tag = "✅" if f.get("confirmed") else "⚠️"
+                 lines.append(f"- {tag} {f.get('text', '')} ([Kaynak]({f.get('source', '#')}))")
+             lines.append("\n### Cevaplanması Gereken Sentez Soruları")
+             for q in self.SYNTHESIS_QUESTIONS.get(cat_key, []):
+                 lines.append(f"- {q}")
+             lines.append("")
+
+         report_path.write_text("\n".join(lines), encoding="utf-8")
+         return report_path
+
+     def _save_data(self):
+         timestamp = datetime.now().strftime("%Y-%m-%d")
+         slug = self.OUTPUT_SUBDIR.replace("/", "-")
+         data_path = self.output_dir / f"{slug}-data-{timestamp}.json"
+
+         data = {
+             "agent": self.__class__.__name__,
+             "timestamp": datetime.now().isoformat(),
+             "sections": {
+                 key: {
+                     "category": s.category,
+                     "label": s.label,
+                     "queries_executed": s.queries_executed,
+                     "findings": s.findings,
+                     "gaps": s.gaps,
+                     "sources": s.sources,
+                     "synthesis_questions": self.SYNTHESIS_QUESTIONS.get(key, []),
+                 }
+                 for key, s in self.sections.items()
+             },
+         }
+         data_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+
+     def list_categories(self):
+         print("Mevcut kategoriler:")
+         for key, cat in self.CATEGORIES.items():
+             print(f"  - {key}: {cat['label']}")
+
+ # ============================================================
+ # Agent Implementations
+ # ============================================================
+
+ class MaliMusavirAgent(TRResearchAgent):
+     """Researches company formation, tax, IP, and medical device regulations."""
+     CATEGORIES = MALI_CATEGORIES
+     SYNTHESIS_QUESTIONS = MALI_SYNTHESIS_QUESTIONS
+     OUTPUT_SUBDIR = "tr-mali"
+     REPORT_TITLE = "Mali Müşavir Raporu — Şirket Kuruluşu ve Mevzuat"
+
+
+ class FonArastirmaAgent(TRResearchAgent):
+     """Researches R&D funding, focusing on bilateral Swiss-Turkish opportunities."""
+     CATEGORIES = FON_CATEGORIES
+     SYNTHESIS_QUESTIONS = FON_SYNTHESIS_QUESTIONS
+     OUTPUT_SUBDIR = "tr-fonlar"
+     REPORT_TITLE = "Fon Araştırma Raporu — TÜBİTAK ve Stratejik Destekler"
+
+
+ if __name__ == "__main__":
+     import argparse as _ap
+
+     parser = _ap.ArgumentParser(description="GURMA.ai Turkish Research Agents")
+     parser.add_argument("agent", choices=["mali", "fonlar"], help="Agent to run")
+     parser.add_argument("-c", "--categories", nargs="+", help="Specific categories to run")
+     parser.add_argument("-d", "--delay", type=float, default=1.0, help="Delay between searches, in seconds")
+     parser.add_argument("--list-categories", action="store_true")
+
+     args = parser.parse_args()
+
+     agent_map = {"mali": MaliMusavirAgent, "fonlar": FonArastirmaAgent}
+     agent_instance = agent_map[args.agent]()
+
+     if args.list_categories:
+         agent_instance.list_categories()
+     else:
+         agent_instance.run(categories=args.categories, delay=args.delay)
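+
+     # Example invocations (illustrative; the file name per this commit is tr_agents.py):
+     #   python tr_agents.py mali --list-categories
+     #   python tr_agents.py fonlar -c tubitak ab_fonlari -d 2.0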
tr_tab.py ADDED
@@ -0,0 +1,218 @@
+ """
+ GURMA.ai — Turkey Expansion Tab
+
+ Displays research results from the Mali and Fonlar agents,
+ plus executive summary reports.
+ """
+
+ import html  # NOTE: added so web-derived snippets can be escaped before HTML rendering below
+ import json
+ import os
+ from datetime import datetime
+ from pathlib import Path
+
+ import streamlit as st
+
+ # ============================================================
+ # Environment & Paths
+ # ============================================================
+
+ IS_HF_SPACE = os.getenv("HF_SPACE") or Path("/app/research.py").exists()
+
+ if IS_HF_SPACE:
+     DATA_ROOT = Path("/app/data")
+     DOCS_ROOT = Path("/app/docs")
+ else:
+     DATA_ROOT = Path(__file__).resolve().parent.parent.parent / "data"
+     DOCS_ROOT = Path(__file__).resolve().parent.parent.parent / "docs"
+
+ TR_MALI_DIR = DATA_ROOT / "tr-mali"
+ TR_FONLAR_DIR = DATA_ROOT / "tr-fonlar"
+
+ AGENT_CONFIG = {
+     "tr-mali": {
+         "label": "Mali Müşavir",
+         "icon": "🏛️",
+         "dir": TR_MALI_DIR,
+         "description": "Company formation, tax, IP, regulatory",
+     },
+     "tr-fonlar": {
+         "label": "Fon Araştırma",
+         "icon": "💰",
+         "dir": TR_FONLAR_DIR,
+         "description": "TÜBİTAK, KOSGEB, EU/bilateral funding",
+     },
+ }
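+
+ # NOTE: the keys here ("tr-mali", "tr-fonlar") double as the data
+ # subdirectory names under data/ and match the agents' OUTPUT_SUBDIR values
+ # in tr_agents.py, so adding a new agent only requires a matching entry here.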
+
+
+ # ============================================================
+ # Data Loading
+ # ============================================================
+
+
+ @st.cache_data(ttl=120)
+ def _load_tr_data(agent_key: str) -> list[dict]:
+     """Load all JSON data files for a TR agent, newest first."""
+     agent_dir = AGENT_CONFIG[agent_key]["dir"]
+     if not agent_dir.exists():
+         return []
+
+     results = []
+     for f in sorted(agent_dir.glob("*.json"), reverse=True):
+         try:
+             with open(f, encoding="utf-8") as fh:
+                 data = json.load(fh)
+             data["_filename"] = f.name
+             results.append(data)
+         except Exception:
+             continue
+     return results
+
+
+ @st.cache_data(ttl=120)
+ def _load_exec_summaries() -> list[dict]:
+     """Load executive summary markdown files, newest first."""
+     if not DOCS_ROOT.exists():
+         return []
+
+     summaries = []
+     for f in sorted(DOCS_ROOT.glob("exec-summary-*.md"), reverse=True):
+         try:
+             content = f.read_text(encoding="utf-8")
+             title_line = ""
+             for line in content.split("\n"):
+                 if line.startswith("# "):
+                     title_line = line[2:].strip()
+                     break
+             summaries.append({
+                 "filename": f.name,
+                 "title": title_line or f.stem,
+                 "content": content,
+                 "mtime": datetime.fromtimestamp(f.stat().st_mtime),
+             })
+         except Exception:
+             continue
+     return summaries
+
+
+ # ============================================================
+ # Rendering Helpers
+ # ============================================================
+
+
+ def _render_finding(finding: dict):
+     """Render a single research finding with a source-quality indicator."""
+     text = finding.get("text", "")
+     source = finding.get("source", "")
+     confirmed = finding.get("confirmed", False)
+
+     if not text or len(text.strip()) < 20:
+         return
+
+     tag = "✅" if confirmed else "⚠️"
+     domain = ""
+     if source:
+         try:
+             from urllib.parse import urlparse
+             domain = urlparse(source).netloc
+             if domain.startswith("www."):
+                 domain = domain[4:]
+         except Exception:
+             domain = source[:40]
+
+     truncated = text[:250] + "..." if len(text) > 250 else text
+     # Snippets and URLs come from the open web, so escape them before they
+     # are injected into raw HTML via unsafe_allow_html.
+     safe_text = html.escape(truncated)
+     source_html = f" <a href='{html.escape(source)}' style='color:#888;font-size:0.75em;'>{html.escape(domain)}</a>" if source else ""
+     st.markdown(
+         f"{tag} <span style='font-size:0.88em;'>{safe_text}</span>{source_html}",
+         unsafe_allow_html=True,
+     )
+
+
+ def _render_section(section: dict):
+     """Render a research section (category) with findings and gaps."""
+     label = section.get("label", section.get("category", "Unknown"))
+     findings = section.get("findings", [])
+     gaps = section.get("gaps", [])
+     synthesis_qs = section.get("synthesis_questions", [])
+
+     confirmed_count = sum(1 for f in findings if isinstance(f, dict) and f.get("confirmed"))
+     total = len(findings)
+
+     header = f"**{label}** — {total} findings"
+     if confirmed_count:
+         header += f" ({confirmed_count} official)"
+     if gaps:
+         header += f" · {len(gaps)} gaps"
+
+     with st.expander(header, expanded=False):
+         if synthesis_qs:
+             st.caption("**Key questions:** " + " · ".join(synthesis_qs))
+             st.markdown("")
+
+         for f in findings[:12]:
+             if isinstance(f, dict):
+                 _render_finding(f)
+
+         if len(findings) > 12:
+             st.caption(f"... and {len(findings) - 12} more findings")
+
+         if gaps:
+             st.markdown("---")
+             for g in gaps:
+                 gap_text = g.get("text", g) if isinstance(g, dict) else g
+                 st.caption(f"🔍 **Gap:** {gap_text}")
+
+
+ def _render_agent_data(agent_key: str, data_files: list[dict]):
+     """Render all data for one TR agent (newest run only)."""
+     if not data_files:
+         st.info(f"No data files found in `data/{agent_key}/`. Run the agent first.")
+         return
+
+     latest = data_files[0]
+     timestamp = latest.get("timestamp", "")[:16].replace("T", " ")
+     sections = latest.get("sections", {})
+
+     st.caption(f"Latest run: {timestamp} · {len(sections)} categories · File: `{latest.get('_filename', '')}`")
+
+     for section_data in sections.values():
+         _render_section(section_data)
+
+
+ # ============================================================
+ # Main Entry Point
+ # ============================================================
+
+
+ def render_tr_tab():
+     """Main entry point — called from app.py."""
+     st.title("Turkey Expansion")
+     st.caption("Company formation research & funding intelligence for Gurma Turkey")
+
+     # --- Executive Summaries ---
+     summaries = _load_exec_summaries()
+     if summaries:
+         st.header("Executive Summaries")
+         for s in summaries:
+             age = datetime.now() - s["mtime"]
+             age_label = "today" if age.days == 0 else f"{age.days}d ago"
+             with st.expander(f"📋 {s['title']} ({age_label})", expanded=len(summaries) <= 2):
+                 st.markdown(s["content"])
+
+         st.divider()
+
+     # --- Agent Research Data ---
+     st.header("Research Data")
+
+     agent_tabs = st.tabs([
+         f"{cfg['icon']} {cfg['label']}" for cfg in AGENT_CONFIG.values()
+     ])
+
+     for tab, agent_key in zip(agent_tabs, AGENT_CONFIG.keys()):
+         with tab:
+             config = AGENT_CONFIG[agent_key]
+             st.caption(config["description"])
+             data_files = _load_tr_data(agent_key)
+             _render_agent_data(agent_key, data_files)
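+
+ # Usage sketch (assumption — the actual wiring lives in app.py, not in this
+ # file): per the docstring above, the dashboard mounts this tab roughly as
+ #
+ #     from tr_tab import render_tr_tab
+ #     render_tr_tab()
+ #
+ # with the import path adjusted for local (non-Space) checkouts.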