Upload utils.py
Browse files
utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils.py
|
| 2 |
+
from typing import Dict, Set
|
| 3 |
+
|
| 4 |
+
def text_blob(b: Dict) -> str:
    """Join a record's descriptive fields into one " | "-separated string.

    Reads the "title", "authors", "categories", "subjects" and
    "description" keys of *b*, in that order. Missing keys and falsy
    values (such as ``None`` or ``""``) are skipped.

    Args:
        b: Mapping holding the record's fields.

    Returns:
        The remaining fields joined by " | ", or "" when none are present.
    """
    fields = ("title", "authors", "categories", "subjects", "description")
    values = (b.get(key, "") for key in fields)
    # str() keeps a non-string field (e.g. a number) from making
    # str.join raise TypeError; string fields pass through unchanged.
    return " | ".join(str(value) for value in values if value)
|
| 13 |
+
|
| 14 |
+
def _to_tagset(s: str) -> Set[str]:
|
| 15 |
+
return {t.strip().lower() for t in (s or "").split(";") if t.strip()}
|
| 16 |
+
|
| 17 |
+
def jaccard(a: str, b: str) -> float:
    """Return the Jaccard similarity of two tag strings.

    Both arguments are parsed with ``_to_tagset``. The score is the number
    of tags common to both sets divided by the number of distinct tags
    across the two sets.

    Returns:
        A value between 0.0 and 1.0; 0.0 when either tag set is empty.
    """
    tags_a = _to_tagset(a)
    tags_b = _to_tagset(b)
    # Guard first: with both sets empty the union is empty and the
    # division below would raise ZeroDivisionError.
    if not (tags_a and tags_b):
        return 0.0
    shared = tags_a.intersection(tags_b)
    combined = tags_a.union(tags_b)
    return len(shared) / len(combined)
|