LiamKhoaLe committed on
Commit
0d7e5cb
·
1 Parent(s): d443075

Upd pg search method with hardcode url technique

Browse files
app/.DS_Store CHANGED
Binary files a/app/.DS_Store and b/app/.DS_Store differ
 
app/services/project_gutenberg.py CHANGED
@@ -18,14 +18,26 @@ async def search(q: str):
18
 
19
  results = []
20
  for b in books:
21
- # Find a PDF format, ignore non-PDF rows
22
  pdf_link = next(
23
- (v for k, v in b["formats"].items() if k.endswith("pdf")), None
24
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  if not pdf_link:
26
  logger.debug(f"[GUT] skipped (no PDF): {b['title']}")
27
  continue
28
-
29
  results.append(
30
  {
31
  "title": b["title"],
 
18
 
19
  results = []
20
  for b in books:
 
21
  pdf_link = next(
22
+ (v for k, v in b["formats"].items() if k.lower().endswith("pdf")), None
23
  )
24
+ # No PDF link found among the book's listed formats
25
+ if not pdf_link:
26
+ try:
27
+ # Attempt fallback hardcoded PDF URL
28
+ fallback_url = f"https://www.gutenberg.org/files/{b['id']}/{b['id']}-pdf.pdf"
29
+ async with httpx.AsyncClient(timeout=5) as client:
30
+ head_resp = await client.head(fallback_url)
31
+ if head_resp.status_code == 200:
32
+ pdf_link = fallback_url
33
+ # PDF not accessible from the hardcoded fallback URL
34
+ except Exception as e:
35
+ logger.debug(f"[GUT] fallback failed for {b['id']}: {e}")
36
+ # Skip the book when neither the listed formats nor the hardcoded fallback URL yields a PDF link
37
  if not pdf_link:
38
  logger.debug(f"[GUT] skipped (no PDF): {b['title']}")
39
  continue
40
+ # Final JSON
41
  results.append(
42
  {
43
  "title": b["title"],
test/db_debug.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick CLI helper to peek at MongoDB + GridFS.
3
+ Usage:
4
+ python test/db_debug.py
5
+ or python test/db_debug.py --doc <document_id>
6
+ """
7
+
8
+ import argparse, os, json, pprint
9
+ from pymongo import MongoClient
10
+ from gridfs import GridFSBucket
11
+
12
# Connection settings are read from the environment. MONGODB_URI has no
# fallback here, so None is handed to MongoClient when it is unset
# (presumably the driver then uses its own default host -- confirm).
MONGO_URI = os.environ.get("MONGODB_URI")
MONGO_DB_NAME = os.environ.get("MONGODB_DB", "querysearcher")

# Shared handles used by the helpers below: the client, the selected
# database, and a GridFS bucket opened on that database.
client = MongoClient(MONGO_URI)
db = client[MONGO_DB_NAME]
fs = GridFSBucket(db)
18
+
19
def list_documents():
    """Print one summary line per entry in the ``documents`` collection.

    Each line shows the document's ``_id``, its ``title`` left-aligned in a
    60-character column, and its ``status``, separated by `` | ``.
    """
    for record in db.documents.find():
        identifier = record["_id"]
        title = record["title"]
        status = record["status"]
        print(f"{identifier} | {title:<60} | {status}")
22
+
23
def doc_detail(doc_id):
    """Print everything known about a single document.

    Looks up ``doc_id`` in the ``documents`` collection, pretty-prints the
    stored record, reports how many ``embeddings`` entries reference it via
    ``document_id``, and checks whether a GridFS file named
    ``<doc_id>.pdf`` exists.

    Args:
        doc_id: Identifier used verbatim as the ``_id`` filter, as the
            ``document_id`` filter, and as the stem of the GridFS file name.
            NOTE(review): the CLI passes a string; if ``_id`` values are
            stored as ObjectId this lookup will never match -- confirm.

    Returns:
        None. All results are written to stdout.
    """
    # Imported here so the narrowed exception handling below does not depend
    # on the module's top-level import block.
    from gridfs.errors import NoFile

    doc = db.documents.find_one({"_id": doc_id})
    if not doc:
        print("No such doc.")
        return
    pprint.pp(doc)

    # Number of embedding chunks that point back at this document.
    cnt = db.embeddings.count_documents({"document_id": doc_id})
    print(f"Embeddings chunks: {cnt}")

    # Confirm the PDF exists. Only NoFile means "missing"; any other failure
    # (connection error, Ctrl-C, ...) propagates instead of being
    # misreported as a missing PDF.
    try:
        stream = fs.open_download_stream_by_name(f"{doc_id}.pdf")
    except NoFile:
        print("⚠️ PDF missing in GridFS")
    else:
        stream.close()
        print("✅ PDF present in GridFS")
38
+
39
if __name__ == "__main__":
    # CLI entry point: with --doc show one document in detail, otherwise
    # list every document.
    cli = argparse.ArgumentParser()
    cli.add_argument("--doc", help="document_id to inspect")
    options = cli.parse_args()
    if not options.doc:
        list_documents()
    else:
        doc_detail(options.doc)