thearn committed on
Commit
412abad
·
1 Parent(s): 1fdd88e

request working

Browse files
Files changed (1) hide show
  1. src/app.py +70 -179
src/app.py CHANGED
@@ -1,10 +1,26 @@
1
  import streamlit as st
2
  import asyncio
3
- import aiohttp
4
- from bs4 import BeautifulSoup
5
- from typing import Dict, Any, Optional, List, Set, cast
 
6
 
7
- CacheType = Dict[int, Dict[str, Any]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def get_id_from_input(val: str) -> Optional[int]:
10
  try:
@@ -12,186 +28,67 @@ def get_id_from_input(val: str) -> Optional[int]:
12
  except Exception:
13
  return None
14
 
15
- async def fetch_record(
16
- record_id: int,
17
- session: aiohttp.ClientSession,
18
- cache: Optional[CacheType],
19
- delay: float = 0.0,
20
- ) -> Optional[Dict[str, Any]]:
21
- if cache is not None and record_id in cache:
22
- return cache[record_id]
23
- url = f"https://www.mathgenealogy.org/id.php?id={record_id}"
24
- async with session.get(url) as resp:
25
- html = await resp.text()
26
- if delay > 0:
27
- await asyncio.sleep(delay)
28
- soup = BeautifulSoup(html, "html.parser")
29
- if soup.string == "Non-numeric id supplied. Aborting.":
30
- return None
31
- if (
32
- soup.p is not None
33
- and soup.p.string
34
- == "You have specified an ID that does not exist in the database. Please back up and try again."
35
- ):
36
- return None
37
- h2 = soup.find("h2")
38
- name = h2.get_text(strip=True) if h2 and hasattr(h2, "get_text") else str(record_id)
39
- institution = None
40
- year = None
41
- for inst in soup.find_all("div", style="line-height: 30px; text-align: center; margin-bottom: 1ex"):
42
- if hasattr(inst, "find"):
43
- span1 = inst.find("span")
44
- if span1 and hasattr(span1, "find"):
45
- span2 = span1.find("span")
46
- if span2 and hasattr(span2, "text"):
47
- institution = span2.text
48
- if span1 and hasattr(span1, "contents") and span1.contents:
49
- y = span1.contents[-1]
50
- if isinstance(y, str):
51
- y = y.strip()
52
- if y:
53
- y = y.split(",")[0].strip()
54
- if y.isdigit():
55
- year = int(y)
56
- advisors = []
57
- for a in soup.find_all(string=lambda s: s and ("Advisor" in cast(str, s) or "Promotor" in cast(str, s))):
58
- # Ensure 'a' itself is treated as a string for the 'in' check
59
- if "Advisor: Unknown" in str(a):
60
- continue
61
- next_tag = a.find_next() if hasattr(a, "find_next") else None
62
- if next_tag and hasattr(next_tag, "attrs") and "href" in next_tag.attrs:
63
- try:
64
- advisors.append(int(next_tag.attrs["href"].split("=")[-1]))
65
- except Exception:
66
- pass
67
- table = soup.find("table")
68
- descendants = []
69
- if table and hasattr(table, "find_all"):
70
- for a in table.find_all("a"):
71
- if hasattr(a, "attrs") and "href" in a.attrs:
72
- try:
73
- descendants.append(int(a.attrs["href"].split("=")[-1]))
74
- except Exception:
75
- pass
76
- record = {
77
- "id": record_id,
78
- "name": name,
79
- "institution": institution,
80
- "year": year,
81
- "advisors": advisors,
82
- "descendants": descendants,
83
  }
84
- if cache is not None:
85
- cache[record_id] = record
86
- return record
87
 
88
- async def fetch_advisors_parallel(
89
- rid: int,
90
- session: aiohttp.ClientSession,
91
- cache: Optional[CacheType],
92
- semaphore: asyncio.Semaphore,
93
- tree: Dict[int, Dict[str, Any]],
94
- progress_cb,
95
- visited: Set[int],
96
- delay: float = 0.0,
97
- ):
98
- if rid in visited:
99
- return
100
- visited.add(rid)
101
- async with semaphore:
102
- rec = await fetch_record(rid, session, cache, delay)
103
- if rec is not None:
104
- tree[rid] = rec
105
- await progress_cb(tree)
106
- tasks = [
107
- fetch_advisors_parallel(
108
- adv, session, cache, semaphore, tree, progress_cb, visited, delay
109
- )
110
- for adv in rec["advisors"]
111
- ]
112
- if tasks:
113
- await asyncio.gather(*tasks)
114
 
115
- def tree_to_dot(tree: Dict[int, Dict[str, Any]]) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  lines = [
117
  "digraph G {",
118
- " rankdir=TB; // Top to Bottom layout, could also be LR (Left to Right)",
119
- ' node [shape=box, style="rounded,filled", fillcolor=lightyellow]; // Optional: style nodes',
120
  ' edge [arrowhead=vee];'
121
  ]
122
-
123
- # Group nodes by year
124
- nodes_by_year: Dict[Optional[int], List[int]] = {}
125
- for node_id, node_data in tree.items():
126
- year = node_data.get("year") # Might be None
127
- if year not in nodes_by_year:
128
- nodes_by_year[year] = []
129
- nodes_by_year[year].append(node_id)
130
-
131
  # Define nodes and their labels
132
- for node_id, node in tree.items():
133
  name = node.get("name", str(node_id))
134
  year_str = f" ({node.get('year')})" if node.get('year') is not None else " (Year Unknown)"
135
  label = f"{name}{year_str}"
136
- # Add a tooltip with more info if possible (Graphviz might not render HTML tooltips in all viewers)
137
- tooltip = f"ID: {node_id}\nName: {name}\nYear: {node.get('year', 'N/A')}\nInstitution: {node.get('institution', 'N/A')}"
138
  lines.append(f' "{node_id}" [label="{label}", tooltip="{tooltip}"];')
139
-
140
  # Define edges
141
- for node_id, node in tree.items():
142
  for adv_id in node.get("advisors", []):
143
- if adv_id in tree:
144
  lines.append(f' "{adv_id}" -> "{node_id}";')
145
-
146
- # Add rank constraints for years
147
- # Sort years, placing None (unknown year) perhaps at the top or bottom.
148
- # For this example, None years will not be part of explicit rank=same groups.
149
- # Dot will place them based on connections.
150
- # Consider creating a specific rank for "Unknown Year" if desired.
151
-
152
- # Sort known years.
153
- # Years are typically displayed with earlier years at the top (if rankdir=TB)
154
- # or left (if rankdir=LR). Dot usually handles this naturally with directed edges.
155
- # The rank=same constraint is more about aligning nodes *within* the same year.
156
- sorted_known_years = sorted([y for y in nodes_by_year.keys() if y is not None])
157
-
158
- for year in sorted_known_years:
159
- nodes_in_year = nodes_by_year[year]
160
- if len(nodes_in_year) > 0: # Create a rank group even for single nodes if you want to ensure year separation
161
- # Invisible subgraph for ranking
162
- lines.append(f" subgraph cluster_year_{year} {{") # Naming cluster helps visually if you add style
163
- lines.append(" label=\"\"; // No visible label for the cluster itself")
164
- lines.append(" style=invis;")
165
- lines.append(" {")
166
- lines.append(" rank=same;")
167
- # Add nodes to the rank group
168
- for node_id in nodes_in_year:
169
- lines.append(f' "{node_id}";')
170
- lines.append(" }")
171
- lines.append(" }")
172
-
173
- # Handle nodes with unknown years (year is None)
174
- # Option 1: Let them float (current behavior if not explicitly ranked)
175
- # Option 2: Group them in their own rank (e.g., at the top or bottom)
176
- if None in nodes_by_year and nodes_by_year[None]:
177
- lines.append(" subgraph cluster_year_unknown {")
178
- lines.append(" label=\"\"; style=invis;")
179
- lines.append(" {")
180
- lines.append(" rank=same; // Or rank=min/max if you want them at very top/bottom")
181
- for node_id in nodes_by_year[None]:
182
- lines.append(f' "{node_id}";')
183
- lines.append(" }")
184
- lines.append(" }")
185
-
186
  lines.append("}")
187
  return "\n".join(lines)
188
 
189
  def main():
190
- st.title("Math Genealogy Ancestor Tree (Async Parallel Scrape)")
191
  mgp_id_str = st.text_input("Enter MGP ID (integer):")
192
- use_cache = st.checkbox("Use in-memory cache", value=True)
193
- concurrency = st.slider("Max parallel requests", min_value=1, max_value=10, value=5)
194
- delay = st.slider("Delay between requests (seconds)", min_value=0.0, max_value=2.0, value=0.2, step=0.05)
195
  progress_placeholder = st.empty()
196
  graph_placeholder = st.empty()
197
  run_btn = st.button("Show Ancestor Tree")
@@ -200,28 +97,22 @@ def main():
200
  if mgp_id is None:
201
  st.error("Please enter a valid integer MGP ID.")
202
  return
203
- cache: Optional[CacheType] = {} if use_cache else None
204
  loop = asyncio.new_event_loop()
205
  asyncio.set_event_loop(loop)
206
- progress_nodes: List[int] = []
207
- async def progress_cb(tree):
208
- progress_nodes.clear()
209
- progress_nodes.extend(tree.keys())
210
- progress_placeholder.info(f"Ancestors found: {len(tree)}")
211
- dot = tree_to_dot(tree)
212
- graph_placeholder.graphviz_chart(dot)
213
  async def runner():
214
- async with aiohttp.ClientSession() as session:
215
- semaphore = asyncio.Semaphore(concurrency)
216
- tree: Dict[int, Dict[str, Any]] = {}
217
- visited: Set[int] = set()
218
- await fetch_advisors_parallel(
219
- mgp_id, session, cache, semaphore, tree, progress_cb, visited, delay
220
- )
221
  try:
222
  loop.run_until_complete(runner())
223
  progress_placeholder.success("Done!")
224
  except Exception as e:
 
225
  progress_placeholder.error(f"Error: {e}")
226
 
227
  if __name__ == "__main__":
 
1
  import streamlit as st
2
  import asyncio
3
+ import websockets
4
+ import json
5
+ import platform
6
+ from typing import Dict, Any, Optional, List, Literal, TypedDict, cast
7
 
8
+ GGRAPHER_URI = "wss://ggrphr.davidalber.net"
9
+
10
class StartNodeRequest(TypedDict):
    """One entry of ``startNodes`` in a ``build-graph`` request."""

    # Record the graph is built from; make_payload passes the user-entered ID.
    recordId: int
    # NOTE(review): presumably toggles traversal toward advisors — confirm
    # against the graph service's protocol.
    getAdvisors: bool
    # NOTE(review): presumably toggles traversal toward descendants — confirm.
    getDescendants: bool


class RequestPayload(TypedDict):
    """JSON message sent over the websocket to request a graph."""

    kind: Literal["build-graph"]
    options: Dict[Literal["reportingCallback"], bool]
    startNodes: List[StartNodeRequest]


class ProgressCallback(TypedDict):
    """Payload of a ``progress`` message, as handed to the progress callback."""

    queued: int
    fetching: int
    done: int
24
 
25
  def get_id_from_input(val: str) -> Optional[int]:
26
  try:
 
28
  except Exception:
29
  return None
30
 
31
def make_payload(record_id: int) -> RequestPayload:
    """Build the ``build-graph`` request for a single starting record.

    The request asks for progress reporting and starts from ``record_id``
    with advisors enabled and descendants disabled.

    Args:
        record_id: ID of the record to start from.

    Returns:
        The request dictionary, ready to be serialized as JSON.
    """
    start_node: StartNodeRequest = {
        "recordId": record_id,
        "getAdvisors": True,
        "getDescendants": False,
    }
    request: RequestPayload = {
        "kind": "build-graph",
        "options": {"reportingCallback": True},
        "startNodes": [start_node],
    }
    return request
 
 
 
41
 
42
async def get_graph(payload: RequestPayload, progress_cb=None) -> Dict[str, Any]:
    """Request a graph over the websocket and return its payload.

    Connects to ``GGRAPHER_URI``, sends ``payload`` as JSON, then reads
    messages until one with ``kind == "graph"`` arrives.

    Args:
        payload: Request to send, serialized with ``json.dumps``.
        progress_cb: Optional callable invoked synchronously with the payload
            of every ``"progress"`` message. When falsy, progress messages
            are ignored.

    Returns:
        The ``payload`` of the first ``"graph"`` message. Any decoded JSON
        object that has a ``"nodes"`` key gets the keys of that mapping
        converted to ``int``.
    """

    def nodes_keys_to_int(obj: Dict[Any, Any]) -> Dict[Any, Any]:
        # json.loads calls this hook for every decoded JSON object; only
        # objects carrying a "nodes" mapping are rebuilt.
        if "nodes" not in obj:
            return obj
        rebuilt = {key: value for key, value in obj.items() if key != "nodes"}
        rebuilt["nodes"] = {int(key): value for key, value in obj["nodes"].items()}
        return rebuilt

    async with websockets.connect(GGRAPHER_URI) as ws:  # type: ignore[attr-defined]
        await ws.send(json.dumps(payload))
        while True:
            raw_message = await ws.recv()
            message = json.loads(raw_message, object_hook=nodes_keys_to_int)
            body = message.get("payload")
            kind = message["kind"]
            if kind == "graph":
                return cast(Dict[str, Any], body)
            if kind == "progress" and progress_cb:
                progress_cb(cast(ProgressCallback, body))
            # Any other message kind is ignored; keep waiting for the graph.
65
+
66
def tree_to_dot(graph: Dict[str, Any]) -> str:
    """Render a graph payload as Graphviz DOT source.

    Every entry of ``graph["nodes"]`` becomes a box node labelled
    ``"<name> (<year>)"`` with a tooltip listing ID, name, year and
    institution. For every advisor ID that is itself present in the node
    mapping, an edge ``advisor -> node`` is emitted.

    Args:
        graph: Mapping with an optional ``"nodes"`` entry that maps node IDs
            to record dictionaries. The keys read from each record are
            ``name``, ``year``, ``institution`` and ``advisors``.

    Returns:
        The DOT document as a single newline-joined string.
    """

    def _dot_escape(value: Any) -> str:
        # Inside a DOT double-quoted string an unescaped '"' ends the string
        # and a backslash starts an escape sequence, so both must be escaped
        # before free text (names, institutions) is interpolated.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    def _or_na(value: Any) -> Any:
        # dict.get(key, default) only applies the default when the key is
        # absent; a present-but-null value must also be shown as "N/A".
        return "N/A" if value is None else value

    nodes = graph.get("nodes", {})
    lines = [
        "digraph G {",
        " rankdir=TB;",
        ' node [shape=box, style="rounded,filled", fillcolor=lightyellow];',
        ' edge [arrowhead=vee];',
    ]

    # Define nodes and their labels
    for node_id, node in nodes.items():
        name = _dot_escape(node.get("name", str(node_id)))
        year = node.get("year")
        year_str = f" ({year})" if year is not None else " (Year Unknown)"
        label = f"{name}{year_str}"
        institution = _dot_escape(_or_na(node.get("institution")))
        # "\\n" emits a literal backslash-n, which is DOT's own line-break
        # escape; a real newline here would split the attribute string.
        tooltip = (
            f"ID: {node_id}\\nName: {name}\\nYear: {_or_na(year)}"
            f"\\nInstitution: {institution}"
        )
        lines.append(f' "{node_id}" [label="{label}", tooltip="{tooltip}"];')

    # Define edges
    for node_id, node in nodes.items():
        # "or ()" tolerates a record whose advisors value is null.
        for adv_id in node.get("advisors") or ():
            # Skip advisors that are not part of the graph, so no edge
            # points at a node that was never declared.
            if adv_id in nodes:
                lines.append(f' "{adv_id}" -> "{node_id}";')

    lines.append("}")
    return "\n".join(lines)
88
 
89
  def main():
90
+ st.title("Math Genealogy Ancestor Tree (WebSocket API)")
91
  mgp_id_str = st.text_input("Enter MGP ID (integer):")
 
 
 
92
  progress_placeholder = st.empty()
93
  graph_placeholder = st.empty()
94
  run_btn = st.button("Show Ancestor Tree")
 
97
  if mgp_id is None:
98
  st.error("Please enter a valid integer MGP ID.")
99
  return
100
+ payload = make_payload(mgp_id)
101
  loop = asyncio.new_event_loop()
102
  asyncio.set_event_loop(loop)
103
+ def progress_cb(progress):
104
+ progress_placeholder.info(
105
+ f"Queued: {progress['queued']} | Fetching: {progress['fetching']} | Done: {progress['done']}"
106
+ )
 
 
 
107
  async def runner():
108
+ graph = await get_graph(payload, progress_cb)
109
+ dot = tree_to_dot(graph)
110
+ graph_placeholder.graphviz_chart(dot)
 
 
 
 
111
  try:
112
  loop.run_until_complete(runner())
113
  progress_placeholder.success("Done!")
114
  except Exception as e:
115
+ print(f"Error: {e}")
116
  progress_placeholder.error(f"Error: {e}")
117
 
118
  if __name__ == "__main__":