Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +30 -4
src/streamlit_app.py
CHANGED
|
@@ -1,10 +1,36 @@
|
|
| 1 |
import datetime
|
| 2 |
from urllib.parse import quote
|
| 3 |
-
|
| 4 |
import requests
|
|
|
|
|
|
|
| 5 |
from lxml import etree
|
| 6 |
import streamlit as st
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# =====================================================================
|
| 9 |
# Namespaces
|
| 10 |
# =====================================================================
|
|
@@ -64,7 +90,7 @@ def build_persistent_url(repo_code: str, object_id: str) -> str:
|
|
| 64 |
|
| 65 |
def fetch_mods_xml(mods_url: str) -> etree._Element:
|
| 66 |
"""Lädt eine MODS-Datei oder OAI-PMH von einer URL und gibt das MODS Root-Element zurück."""
|
| 67 |
-
resp =
|
| 68 |
resp.raise_for_status()
|
| 69 |
# Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
|
| 70 |
parser = etree.XMLParser(recover=True, remove_blank_text=True)
|
|
@@ -443,7 +469,7 @@ class CrossrefSchemaResolver(etree.Resolver):
|
|
| 443 |
schema_url = f"https://www.crossref.org/schemas/{url}"
|
| 444 |
|
| 445 |
try:
|
| 446 |
-
response =
|
| 447 |
response.raise_for_status()
|
| 448 |
return self.resolve_string(response.content, context)
|
| 449 |
except Exception:
|
|
@@ -475,7 +501,7 @@ def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
|
|
| 475 |
parser.resolvers.add(CrossrefSchemaResolver())
|
| 476 |
|
| 477 |
# Download main schema
|
| 478 |
-
schema_resp =
|
| 479 |
schema_resp.raise_for_status()
|
| 480 |
|
| 481 |
# Parse schema with resolver
|
|
|
|
| 1 |
import datetime
|
| 2 |
from urllib.parse import quote
|
|
|
|
| 3 |
import requests
|
| 4 |
+
from requests.adapters import HTTPAdapter
|
| 5 |
+
from urllib3.util.retry import Retry
|
| 6 |
from lxml import etree
|
| 7 |
import streamlit as st
|
| 8 |
|
| 9 |
+
# =====================================================================
|
| 10 |
+
# Networking Configuration
|
| 11 |
+
# =====================================================================
|
| 12 |
+
|
| 13 |
+
# Default HTTP headers passed as ``headers=`` on the outbound GET requests in
# this module.
# NOTE(review): the User-Agent imitates a desktop Chrome browser, presumably
# because some upstream servers reject the default python-requests agent;
# confirm this is still required.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    # XML is preferred, but any content type is accepted as a fallback.
    "Accept": "application/xml, text/xml, */*"
}
|
| 17 |
+
|
| 18 |
+
def get_robust_session() -> "requests.Session":
    """Build a ``requests`` session that retries transient GET failures.

    GET requests answered with HTTP 429, 500, 502, 503 or 504 are retried,
    with at most five retries in total and urllib3's backoff between
    attempts (``backoff_factor=1``). The retrying adapter is mounted for both
    ``http://`` and ``https://`` URLs.

    Returns:
        A new ``requests.Session`` with the retrying adapter mounted.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        backoff_factor=1,
        # 429 (rate limiting) is retried alongside the 5xx server/gateway
        # errors.
        status_forcelist=[429, 500, 502, 503, 504],
        # Only GET requests are retried.
        allowed_methods=["GET"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # The same adapter instance serves both URL schemes.
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
|
| 31 |
+
|
| 32 |
+
# Shared module-level session, created once at import time and reused by the
# ``ROBUST_SESSION.get(...)`` calls in this module.
ROBUST_SESSION = get_robust_session()
|
| 33 |
+
|
| 34 |
# =====================================================================
|
| 35 |
# Namespaces
|
| 36 |
# =====================================================================
|
|
|
|
| 90 |
|
| 91 |
def fetch_mods_xml(mods_url: str) -> etree._Element:
|
| 92 |
"""Lädt eine MODS-Datei oder OAI-PMH von einer URL und gibt das MODS Root-Element zurück."""
|
| 93 |
+
resp = ROBUST_SESSION.get(mods_url, headers=HEADERS, timeout=60)
|
| 94 |
resp.raise_for_status()
|
| 95 |
# Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
|
| 96 |
parser = etree.XMLParser(recover=True, remove_blank_text=True)
|
|
|
|
| 469 |
schema_url = f"https://www.crossref.org/schemas/{url}"
|
| 470 |
|
| 471 |
try:
|
| 472 |
+
response = ROBUST_SESSION.get(schema_url, headers=HEADERS, timeout=30)
|
| 473 |
response.raise_for_status()
|
| 474 |
return self.resolve_string(response.content, context)
|
| 475 |
except Exception:
|
|
|
|
| 501 |
parser.resolvers.add(CrossrefSchemaResolver())
|
| 502 |
|
| 503 |
# Download main schema
|
| 504 |
+
schema_resp = ROBUST_SESSION.get(schema_url, headers=HEADERS, timeout=60)
|
| 505 |
schema_resp.raise_for_status()
|
| 506 |
|
| 507 |
# Parse schema with resolver
|