andrehoffmann80 committed on
Commit
092f5ec
·
verified ·
1 Parent(s): 4ce4ec2

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +30 -4
src/streamlit_app.py CHANGED
@@ -1,10 +1,36 @@
1
  import datetime
2
  from urllib.parse import quote
3
-
4
  import requests
 
 
5
  from lxml import etree
6
  import streamlit as st
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # =====================================================================
9
  # Namespaces
10
  # =====================================================================
@@ -64,7 +90,7 @@ def build_persistent_url(repo_code: str, object_id: str) -> str:
64
 
65
  def fetch_mods_xml(mods_url: str) -> etree._Element:
66
  """Lädt eine MODS-Datei oder OAI-PMH von einer URL und gibt das MODS Root-Element zurück."""
67
- resp = requests.get(mods_url)
68
  resp.raise_for_status()
69
  # Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
70
  parser = etree.XMLParser(recover=True, remove_blank_text=True)
@@ -443,7 +469,7 @@ class CrossrefSchemaResolver(etree.Resolver):
443
  schema_url = f"https://www.crossref.org/schemas/{url}"
444
 
445
  try:
446
- response = requests.get(schema_url, timeout=15)
447
  response.raise_for_status()
448
  return self.resolve_string(response.content, context)
449
  except Exception:
@@ -475,7 +501,7 @@ def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
475
  parser.resolvers.add(CrossrefSchemaResolver())
476
 
477
  # Download main schema
478
- schema_resp = requests.get(schema_url, timeout=30)
479
  schema_resp.raise_for_status()
480
 
481
  # Parse schema with resolver
 
1
  import datetime
2
  from urllib.parse import quote
 
3
  import requests
4
+ from requests.adapters import HTTPAdapter
5
+ from urllib3.util.retry import Retry
6
  from lxml import etree
7
  import streamlit as st
8
 
9
+ # =====================================================================
10
+ # Networking Configuration
11
+ # =====================================================================
12
+
13
+ HEADERS = {
14
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
15
+ "Accept": "application/xml, text/xml, */*"
16
+ }
17
+
18
+ def get_robust_session():
19
+ """Returns a requests session with retry logic for transient errors (429, 500, 502, 503, 504)."""
20
+ session = requests.Session()
21
+ retry_strategy = Retry(
22
+ total=5,
23
+ backoff_factor=1,
24
+ status_forcelist=[429, 500, 502, 503, 504],
25
+ allowed_methods=["GET"]
26
+ )
27
+ adapter = HTTPAdapter(max_retries=retry_strategy)
28
+ session.mount("http://", adapter)
29
+ session.mount("https://", adapter)
30
+ return session
31
+
32
+ ROBUST_SESSION = get_robust_session()
33
+
34
  # =====================================================================
35
  # Namespaces
36
  # =====================================================================
 
90
 
91
  def fetch_mods_xml(mods_url: str) -> etree._Element:
92
  """Lädt eine MODS-Datei oder OAI-PMH von einer URL und gibt das MODS Root-Element zurück."""
93
+ resp = ROBUST_SESSION.get(mods_url, headers=HEADERS, timeout=60)
94
  resp.raise_for_status()
95
  # Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
96
  parser = etree.XMLParser(recover=True, remove_blank_text=True)
 
469
  schema_url = f"https://www.crossref.org/schemas/{url}"
470
 
471
  try:
472
+ response = ROBUST_SESSION.get(schema_url, headers=HEADERS, timeout=30)
473
  response.raise_for_status()
474
  return self.resolve_string(response.content, context)
475
  except Exception:
 
501
  parser.resolvers.add(CrossrefSchemaResolver())
502
 
503
  # Download main schema
504
+ schema_resp = ROBUST_SESSION.get(schema_url, headers=HEADERS, timeout=60)
505
  schema_resp.raise_for_status()
506
 
507
  # Parse schema with resolver