Merge dev_new to main (improve retriever and add querying)

#13
by rhbt6767 - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +7 -1
  3. .vscode/launch.json +25 -0
  4. Dockerfile +2 -0
  5. README.md +2 -0
  6. main.py +2 -0
  7. pyproject.toml +11 -10
  8. software/Tesseract-OCR/ambiguous_words.1.html +3 -0
  9. software/Tesseract-OCR/ambiguous_words.exe +3 -0
  10. software/Tesseract-OCR/classifier_tester.1.html +3 -0
  11. software/Tesseract-OCR/classifier_tester.exe +3 -0
  12. software/Tesseract-OCR/cntraining.1.html +3 -0
  13. software/Tesseract-OCR/cntraining.exe +3 -0
  14. software/Tesseract-OCR/combine_lang_model.1.html +3 -0
  15. software/Tesseract-OCR/combine_lang_model.exe +3 -0
  16. software/Tesseract-OCR/combine_tessdata.1.html +3 -0
  17. software/Tesseract-OCR/combine_tessdata.exe +3 -0
  18. software/Tesseract-OCR/dawg2wordlist.1.html +3 -0
  19. software/Tesseract-OCR/dawg2wordlist.exe +3 -0
  20. software/Tesseract-OCR/doc/AUTHORS +3 -0
  21. software/Tesseract-OCR/doc/LICENSE +3 -0
  22. software/Tesseract-OCR/doc/README.md +3 -0
  23. software/Tesseract-OCR/libLerc.dll +3 -0
  24. software/Tesseract-OCR/libarchive-13.dll +3 -0
  25. software/Tesseract-OCR/libb2-1.dll +3 -0
  26. software/Tesseract-OCR/libbrotlicommon.dll +3 -0
  27. software/Tesseract-OCR/libbrotlidec.dll +3 -0
  28. software/Tesseract-OCR/libbz2-1.dll +3 -0
  29. software/Tesseract-OCR/libcairo-2.dll +3 -0
  30. software/Tesseract-OCR/libcrypto-3-x64.dll +3 -0
  31. software/Tesseract-OCR/libcurl-4.dll +3 -0
  32. software/Tesseract-OCR/libdatrie-1.dll +3 -0
  33. software/Tesseract-OCR/libdeflate.dll +3 -0
  34. software/Tesseract-OCR/libexpat-1.dll +3 -0
  35. software/Tesseract-OCR/libffi-8.dll +3 -0
  36. software/Tesseract-OCR/libfontconfig-1.dll +3 -0
  37. software/Tesseract-OCR/libfreetype-6.dll +3 -0
  38. software/Tesseract-OCR/libfribidi-0.dll +3 -0
  39. software/Tesseract-OCR/libgcc_s_seh-1.dll +3 -0
  40. software/Tesseract-OCR/libgif-7.dll +3 -0
  41. software/Tesseract-OCR/libgio-2.0-0.dll +3 -0
  42. software/Tesseract-OCR/libglib-2.0-0.dll +3 -0
  43. software/Tesseract-OCR/libgmodule-2.0-0.dll +3 -0
  44. software/Tesseract-OCR/libgobject-2.0-0.dll +3 -0
  45. software/Tesseract-OCR/libgraphite2.dll +3 -0
  46. software/Tesseract-OCR/libharfbuzz-0.dll +3 -0
  47. software/Tesseract-OCR/libiconv-2.dll +3 -0
  48. software/Tesseract-OCR/libicudt74.dll +3 -0
  49. software/Tesseract-OCR/libicuin74.dll +3 -0
  50. software/Tesseract-OCR/libicuuc74.dll +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ software/** filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -26,6 +26,10 @@ test/users/user_accounts.csv
26
  .env.prd
27
  .env.example
28
 
 
 
 
 
29
  erd/
30
  playground/
31
  playground_retriever.py
@@ -33,4 +37,6 @@ playground_chat.py
33
  playground_flush_cache.py
34
  playground_create_user.py
35
  API_CONTRACT.md
36
- context_engineering/
 
 
 
26
  .env.prd
27
  .env.example
28
 
29
+ CLAUDE.md
30
+
31
+ /experiments
32
+ src/rag/experiments/
33
  erd/
34
  playground/
35
  playground_retriever.py
 
37
  playground_flush_cache.py
38
  playground_create_user.py
39
  API_CONTRACT.md
40
+ context_engineering/
41
+ sample_file/
42
+ test_tesseract.py
.vscode/launch.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "DataEyond: FastAPI (debug)",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "module": "uvicorn",
12
+ "args": [
13
+ "main:app",
14
+ "--host", "0.0.0.0",
15
+ "--port", "7860",
16
+ "--reload"
17
+ ],
18
+ "jinja": true,
19
+ "justMyCode": true,
20
+ "envFile": "${workspaceFolder}/.env",
21
+ "console": "integratedTerminal",
22
+ "cwd": "${workspaceFolder}"
23
+ }
24
+ ]
25
+ }
Dockerfile CHANGED
@@ -12,6 +12,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
12
  libpq-dev \
13
  gcc \
14
  libgomp1 \
 
 
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
  RUN addgroup --system app && \
 
12
  libpq-dev \
13
  gcc \
14
  libgomp1 \
15
+ tesseract-ocr \
16
+ poppler-utils \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
  RUN addgroup --system app && \
README.md CHANGED
@@ -11,6 +11,8 @@ short_description: AI Agent core service
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
 
13
 
 
 
14
  How to run:
15
  `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`
16
 
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
 
13
 
14
+ # Agentic Service Data Eyond
15
+
16
  How to run:
17
  `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`
18
 
main.py CHANGED
@@ -10,6 +10,7 @@ from src.api.v1.chat import router as chat_router
10
  from src.api.v1.room import router as room_router
11
  from src.api.v1.users import router as users_router
12
  from src.api.v1.knowledge import router as knowledge_router
 
13
  from src.db.postgres.init_db import init_db
14
  import uvicorn
15
 
@@ -35,6 +36,7 @@ app.include_router(document_router)
35
  app.include_router(knowledge_router)
36
  app.include_router(room_router)
37
  app.include_router(chat_router)
 
38
 
39
 
40
  @app.on_event("startup")
 
10
  from src.api.v1.room import router as room_router
11
  from src.api.v1.users import router as users_router
12
  from src.api.v1.knowledge import router as knowledge_router
13
+ from src.api.v1.db_client import router as db_client_router
14
  from src.db.postgres.init_db import init_db
15
  import uvicorn
16
 
 
36
  app.include_router(knowledge_router)
37
  app.include_router(room_router)
38
  app.include_router(chat_router)
39
+ app.include_router(db_client_router)
40
 
41
 
42
  @app.on_event("startup")
pyproject.toml CHANGED
@@ -79,6 +79,17 @@ dependencies = [
79
  "jsonpatch>=1.33",
80
  "pymongo>=4.14.0",
81
  "psycopg2>=2.9.11",
 
 
 
 
 
 
 
 
 
 
 
82
  ]
83
 
84
  [project.optional-dependencies]
@@ -92,16 +103,6 @@ dev = [
92
  "pre-commit==4.0.1",
93
  ]
94
 
95
- [tool.uv]
96
- dev-dependencies = [
97
- "pytest==8.3.4",
98
- "pytest-asyncio==0.24.0",
99
- "pytest-cov==6.0.0",
100
- "ruff==0.8.4",
101
- "mypy==1.13.0",
102
- "pre-commit==4.0.1",
103
- ]
104
-
105
  [tool.hatch.build.targets.wheel]
106
  packages = ["src/agent_service"]
107
 
 
79
  "jsonpatch>=1.33",
80
  "pymongo>=4.14.0",
81
  "psycopg2>=2.9.11",
82
+ # --- SQL parsing / guardrails ---
83
+ "sqlglot>=25.0.0",
84
+ # --- User-DB connectors (db_pipeline) ---
85
+ "pymysql>=1.1.1",
86
+ "pymssql>=2.3.0",
87
+ "sqlalchemy-bigquery>=1.11.0",
88
+ "snowflake-sqlalchemy>=1.7.0",
89
+ # --- OCR (pdf processing) ---
90
+ "pdf2image>=1.17.0",
91
+ "pytesseract>=0.3.13",
92
+ "pypdf2>=3.0.1",
93
  ]
94
 
95
  [project.optional-dependencies]
 
103
  "pre-commit==4.0.1",
104
  ]
105
 
 
 
 
 
 
 
 
 
 
 
106
  [tool.hatch.build.targets.wheel]
107
  packages = ["src/agent_service"]
108
 
software/Tesseract-OCR/ambiguous_words.1.html ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e122cc6b60a9515b253a99321354ae2de2d5886f72ee6e88841c38f111ff86
3
+ size 17823
software/Tesseract-OCR/ambiguous_words.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85c23ac33fa8235cf5f3adaf87b97c506a56e71a96dd0f7bb83b373c029c2ae5
3
+ size 1066496
software/Tesseract-OCR/classifier_tester.1.html ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb84879ff16408961c96c40dd2640647cf74b8d779a614ef4f86e6eb8daa2b0c
3
+ size 19529
software/Tesseract-OCR/classifier_tester.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e059fdeafdc95a6a4220619226ad0d0e7a48e95fd6902c7cdd523af376d3bd74
3
+ size 4987040
software/Tesseract-OCR/cntraining.1.html ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a2adb8ab0f247b19462b83364bc41dd15004a9d5cade19c678d5be52d9bc756
3
+ size 18156
software/Tesseract-OCR/cntraining.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ea0cf4a9e750cb73bafddcc1bd14193e3dc15f2874a5e5ba4191b4662d0560e
3
+ size 4709776
software/Tesseract-OCR/combine_lang_model.1.html ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9daa4a2944dc0d3c38758670bbbb07dccc17cc2176b2b5e571ce4c99fee04654
3
+ size 21055
software/Tesseract-OCR/combine_lang_model.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd26b40ab4c2633dade1a2678769e9c51075e552be9161d3ce652f52546697e
3
+ size 3503232
software/Tesseract-OCR/combine_tessdata.1.html ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9111e27f1333055506be7d14948b265e031aa0f834b6f68f67ed8c9a12111c60
3
+ size 27130
software/Tesseract-OCR/combine_tessdata.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f905d89cd6bd3737fa78369e6a91ce06a3f10641f56a2e8671e0d6c6168f485a
3
+ size 1281096
software/Tesseract-OCR/dawg2wordlist.1.html ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81566db3708b5a5dd581255eba17a244e51b3b14d9b6cfd82d529c87a0881cad
3
+ size 18336
software/Tesseract-OCR/dawg2wordlist.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee984cc2c0967473a2dcc361274f3e296c9b3019f8840b85446e5ca166f8a532
3
+ size 575032
software/Tesseract-OCR/doc/AUTHORS ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d69deaae7deeaa053e597906f19482aa56e3de1536851cd20600a09d888c1224
3
+ size 838
software/Tesseract-OCR/doc/LICENSE ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ddf9be5c28fe27dad143a5dc76eea25222ad1dd68934a047064e56ed2fa40c5
3
+ size 11560
software/Tesseract-OCR/doc/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7918a2fc5dd61b3ea013601f45007604574452ce4934859dc7f43e2e459c019f
3
+ size 8619
software/Tesseract-OCR/libLerc.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ab688c7c5c4afdfee0f519f942759137067a7876a1c1bb825ee575b0a5760b1
3
+ size 761261
software/Tesseract-OCR/libarchive-13.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98b9b7e9259f23eab6749ee25cca2c0643f09df6b23c58229fd43bf6249195e9
3
+ size 769154
software/Tesseract-OCR/libb2-1.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb8418a1172042dab5227fcd466b1bf293737f5b5cf35359e851516227f109c
3
+ size 34372
software/Tesseract-OCR/libbrotlicommon.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62f363f6aac0d54ce7b115c0412b19e0fe5480f4e6fc1254f663b0ab6868e46f
3
+ size 143397
software/Tesseract-OCR/libbrotlidec.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d6c0f954b6f2db1c1c5fd5904886ae055859a02ca31b6dde5f96f6b0259d23f
3
+ size 60412
software/Tesseract-OCR/libbz2-1.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79cd3a7102e359ef6324e98966a67939e58ec2b651ae2847cae84a4dfa453f7
3
+ size 100964
software/Tesseract-OCR/libcairo-2.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a30ed066ebbb5ed6e97da2c119d61a24e4135518c18f8cd8c61299a5182e720
3
+ size 1207571
software/Tesseract-OCR/libcrypto-3-x64.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dbc1531aa728e1e5d1b11e2927882b8ee2a12bacb2495f04f0d248d5be82cd1
3
+ size 5094016
software/Tesseract-OCR/libcurl-4.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40986750cf6a9eb36c5fcd2cff5275c4cf45e99794bbbef8c3ff5380ce051bc7
3
+ size 751728
software/Tesseract-OCR/libdatrie-1.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f760aef8a31b7c3670484ca4eabab1ab8e95c14052167e191463f6371c3f0850
3
+ size 34386
software/Tesseract-OCR/libdeflate.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6ee7b31a23605d8e8b9be5eae782a3ddf7adf2b7ca2c8e0041bdfd4fe4659ee
3
+ size 91035
software/Tesseract-OCR/libexpat-1.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9ea333175b1e995bbe6a062567f38ff362f1b2f369fa8ebdb0eeed21813bdc6
3
+ size 188909
software/Tesseract-OCR/libffi-8.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1ed51652b3746c1b4bce497b7041e677e5d80c5ab8198efebb8fc6f75329ae8
3
+ size 33708
software/Tesseract-OCR/libfontconfig-1.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6c70b7e205f24ff96b29a80849b59c84f8701b311434f64f2bf93bd36bdb163
3
+ size 350185
software/Tesseract-OCR/libfreetype-6.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5297f754dfca04abdb0d21705d5afc3c4122aa25c7afbdd87900313cb780c868
3
+ size 776690
software/Tesseract-OCR/libfribidi-0.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:252dec79246c013d8b32e4b1a034626965c55853aec32c5cafcd643b90d018db
3
+ size 150259
software/Tesseract-OCR/libgcc_s_seh-1.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0cc0027247c22e73cba10ba9e25ea2444a2b4072cb38e45c2cda7374e12a5c2
3
+ size 117427
software/Tesseract-OCR/libgif-7.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1a179853ae6fad8eb045b8e9f12c55e6fc99929446b02fa345512354cf5e582
3
+ size 41367
software/Tesseract-OCR/libgio-2.0-0.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ec4f4ddbc2384dd8a0d27c6d559bc7034b390c6bfba1ef89b755c2291cd8ef9
3
+ size 1800402
software/Tesseract-OCR/libglib-2.0-0.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:876cf35b494d95ab012e7220208501882d5f40eb7c3f421b7a92b05ff9e8c3f9
3
+ size 1448224
software/Tesseract-OCR/libgmodule-2.0-0.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41bf2686dd2b8ffb471ded05b876dea3fcb5913212b809da143adc3e9f98e5bd
3
+ size 26783
software/Tesseract-OCR/libgobject-2.0-0.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5bd93c3eb8db5e86fe8135ffd6eb246d8f9e5c15b8bc1719ca29586ccf1facd
3
+ size 358489
software/Tesseract-OCR/libgraphite2.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f927efecd958f0e54a24ae297f86cd6aed05e7fc20e5da0882050ab9ec464395
3
+ size 152538
software/Tesseract-OCR/libharfbuzz-0.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ccf58a209c175b4d8bc583e4866e1fb55414effcddebd5f8b2ea5d2feacb0af
3
+ size 1291986
software/Tesseract-OCR/libiconv-2.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:967189adfbc889fde89aafc867f7a1f02731f8592cf6fd5a4ace1929213e2e13
3
+ size 1118202
software/Tesseract-OCR/libicudt74.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a20195f28e27f65709211c8fa02af5197038f5746692c49b34f7f1bdd98931
3
+ size 30796806
software/Tesseract-OCR/libicuin74.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69328f8c98dc27ee26bba561cd9b6077a63dcfca88d601ade3bb50143a171d2e
3
+ size 2958241
software/Tesseract-OCR/libicuuc74.dll ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faea09e5599364842e832ab3f196ad2f166bada760124d10b2acc67c1c06d55c
3
+ size 1839550