Spaces:
Sleeping
Sleeping
Commit ·
047622e
1
Parent(s): 0d7e5cb
Upd search engine
Browse files- app/main.py +7 -4
- app/routers/search.py +7 -3
- app/services/ingest.py +1 -1
app/main.py
CHANGED
|
@@ -5,10 +5,13 @@
|
|
| 5 |
# │ ├── routers/
|
| 6 |
# │ │ ├── search.py
|
| 7 |
# │ │ └── import_doc.py
|
| 8 |
-
# │
|
| 9 |
-
# │
|
| 10 |
-
# │
|
| 11 |
-
# │
|
|
|
|
|
|
|
|
|
|
| 12 |
# ├── Dockerfile
|
| 13 |
# ├── docker-compose.yml
|
| 14 |
# └── README.md
|
|
|
|
| 5 |
# │ ├── routers/
|
| 6 |
# │ │ ├── search.py
|
| 7 |
# │ │ └── import_doc.py
|
| 8 |
+
# │ ├── services/
|
| 9 |
+
# │ │ ├── google_books.py
|
| 10 |
+
# │ │ ├── open_library.py
|
| 11 |
+
# │ │ ├── internet_archive.py
|
| 12 |
+
# │ │ └── project_gutenberg.py
|
| 13 |
+
# │ └── health/
|
| 14 |
+
# │ └── check_status.py
|
| 15 |
# ├── Dockerfile
|
| 16 |
# ├── docker-compose.yml
|
| 17 |
# └── README.md
|
app/routers/search.py
CHANGED
|
@@ -17,10 +17,14 @@ def _tokenize(text: str):
|
|
| 17 |
"""lower-case & keep only alnum tokens"""
|
| 18 |
return re.findall(r"[a-z0-9]+", text.lower())
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
def _title_matches(title: str, query_tokens: list[str]) -> bool:
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
| 24 |
|
| 25 |
@router.get("")
|
| 26 |
async def search_books(q: str = Query(...)):
|
|
|
|
| 17 |
"""lower-case & keep only alnum tokens"""
|
| 18 |
return re.findall(r"[a-z0-9]+", text.lower())
|
| 19 |
|
| 20 |
+
def normalize(text):
    """Return *text* lower-cased with all whitespace removed.

    For example ``"Harry  Potter"`` becomes ``"harrypotter"``, so strings can
    be compared without regard to letter case or spacing.
    """
    # split() with no separator breaks on runs of whitespace and discards
    # them, so re-joining the pieces deletes every whitespace character.
    pieces = text.lower().split()
    return "".join(pieces)
|
| 22 |
+
|
| 23 |
def _title_matches(title: str, query_tokens: list[str]) -> bool:
    """Return True when every query token occurs somewhere in *title*.

    The title is passed through ``normalize`` (lower-cased, whitespace
    removed) before comparison, so tokens match as plain substrings, in any
    order, and may span what were word boundaries in the title (the token
    ``"harrypotter"`` matches the title ``"Harry Potter"``).

    Args:
        title: Raw title to test.
        query_tokens: Tokens of the search query. Presumably already
            lower-cased by ``_tokenize`` -- TODO confirm against the caller.

    Returns:
        True if all tokens are substrings of the normalized title. An empty
        token list matches every title, because ``all`` of nothing is True.
    """
    title_norm = normalize(title)
    # There is no separate check for the tokens joined into a single word:
    # a title that contains the concatenation necessarily contains each
    # individual token, so that case is already covered by this test.
    return all(tok in title_norm for tok in query_tokens)
|
| 28 |
|
| 29 |
@router.get("")
|
| 30 |
async def search_books(q: str = Query(...)):
|
app/services/ingest.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# app/services/ingest.py
|
| 2 |
import os
|
| 3 |
-
import fitz # PyMuPDF
|
| 4 |
import io
|
| 5 |
from app.db import db, grid_fs_bucket
|
| 6 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 1 |
# app/services/ingest.py
|
| 2 |
import os
|
| 3 |
+
import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
|
| 4 |
import io
|
| 5 |
from app.db import db, grid_fs_bucket
|
| 6 |
from sentence_transformers import SentenceTransformer
|