LiamKhoaLe committed on
Commit
047622e
·
1 Parent(s): 0d7e5cb

Upd search engine

Browse files
Files changed (3) hide show
  1. app/main.py +7 -4
  2. app/routers/search.py +7 -3
  3. app/services/ingest.py +1 -1
app/main.py CHANGED
@@ -5,10 +5,13 @@
5
  # │ ├── routers/
6
  # │ │ ├── search.py
7
  # │ │ └── import_doc.py
8
- # │ ── services/
9
- # │ ├── google_books.py
10
- # │ ├── open_library.py
11
- # │ └── internet_archive.py
 
 
 
12
  # ├── Dockerfile
13
  # ├── docker-compose.yml
14
  # └── README.md
 
5
  # │ ├── routers/
6
  # │ │ ├── search.py
7
  # │ │ └── import_doc.py
8
+ # │ ├── services/
9
+ # │ │ ├── google_books.py
10
+ # │ │ ├── open_library.py
11
+ # │ │ ├── internet_archive.py
12
+ # │ │ └── project_gutenberg.py
13
+ # │ └── health/
14
+ # │ └── check_status.py
15
  # ├── Dockerfile
16
  # ├── docker-compose.yml
17
  # └── README.md
app/routers/search.py CHANGED
@@ -17,10 +17,14 @@ def _tokenize(text: str):
17
  """lower-case & keep only alnum tokens"""
18
  return re.findall(r"[a-z0-9]+", text.lower())
19
 
 
 
 
20
  def _title_matches(title: str, query_tokens: list[str]) -> bool:
21
- title_lc = title.lower()
22
- joined = "".join(query_tokens) # “specialistmath” use-case
23
- return all(tok in title_lc for tok in query_tokens) or joined in title_lc
 
24
 
25
  @router.get("")
26
  async def search_books(q: str = Query(...)):
 
17
  """lower-case & keep only alnum tokens"""
18
  return re.findall(r"[a-z0-9]+", text.lower())
19
 
20
+ def normalize(text):
21
+ return re.sub(r'\s+', '', text.lower())
22
+
23
  def _title_matches(title: str, query_tokens: list[str]) -> bool:
24
+ """Check if all query tokens exist in title (unordered), or title contains all joined as a single word"""
25
+ title_norm = normalize(title)
26
+ joined_query = "".join(query_tokens)
27
+ return all(tok in title_norm for tok in query_tokens) or joined_query in title_norm
28
 
29
  @router.get("")
30
  async def search_books(q: str = Query(...)):
app/services/ingest.py CHANGED
@@ -1,6 +1,6 @@
1
  # app/services/ingest.py
2
  import os
3
- import fitz # PyMuPDF
4
  import io
5
  from app.db import db, grid_fs_bucket
6
  from sentence_transformers import SentenceTransformer
 
1
  # app/services/ingest.py
2
  import os
3
+ import fitz # PyMuPDF - convert PDF to plaintext for semantic embedding
4
  import io
5
  from app.db import db, grid_fs_bucket
6
  from sentence_transformers import SentenceTransformer