Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +12 -35
- .gitignore +174 -0
- .gradio/certificate.pem +31 -0
- README.md +108 -0
- chatbot_complete_traces.py +9 -0
- chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/data_level0.bin +3 -0
- chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/header.bin +3 -0
- chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/index_metadata.pickle +3 -0
- chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/length.bin +3 -0
- chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/link_lists.bin +3 -0
- chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/data_level0.bin +3 -0
- chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/header.bin +3 -0
- chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/index_metadata.pickle +3 -0
- chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/length.bin +3 -0
- chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/link_lists.bin +3 -0
- chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/data_level0.bin +3 -0
- chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/header.bin +3 -0
- chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/index_metadata.pickle +3 -0
- chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/length.bin +3 -0
- chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/link_lists.bin +3 -0
- chroma_db/chroma.sqlite3 +3 -0
- chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/data_level0.bin +3 -0
- chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/header.bin +3 -0
- chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/index_metadata.pickle +3 -0
- chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/length.bin +3 -0
- chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/link_lists.bin +3 -0
- chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/data_level0.bin +3 -0
- chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/header.bin +3 -0
- chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/length.bin +3 -0
- chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/link_lists.bin +0 -0
- chroma_db/eca6771a-e870-49de-bf11-9f330748609c/data_level0.bin +3 -0
- chroma_db/eca6771a-e870-49de-bf11-9f330748609c/header.bin +3 -0
- chroma_db/eca6771a-e870-49de-bf11-9f330748609c/index_metadata.pickle +3 -0
- chroma_db/eca6771a-e870-49de-bf11-9f330748609c/length.bin +3 -0
- chroma_db/eca6771a-e870-49de-bf11-9f330748609c/link_lists.bin +3 -0
- constants.py +48 -0
- create_chromadb.ipynb +1139 -0
- dataset/README.md +33 -0
- demo.py +220 -0
- doc_renta/Renta_2023_doc_int.md +0 -0
- doc_renta/Renta_2023_doc_int_corrected.md +0 -0
- doc_renta/Renta_2023_doc_int_corrected_2.md +0 -0
- doc_renta/indice.md +0 -0
- doc_renta/parser.ipynb +361 -0
- docs/images/AgenticRAG.png +0 -0
- docs/images/naive_rag.jpg +0 -0
- papers/A_Survey_of_Large_Language_Models.pdf +3 -0
- papers/BERT.pdf +3 -0
- papers/Language_Models_are_Few_Shot_Learners.pdf +3 -0
- papers/Large_Language_Models_Survey.pdf +3 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,12 @@
|
|
| 1 |
-
|
| 2 |
-
*.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
chroma_db/** filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.sqlite filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
papers/A_Survey_of_Large_Language_Models.pdf filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
papers/attention_is_all_you_need.pdf filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
papers/BERT.pdf filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
papers/efficient_LLM_survey.pdf filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
papers/finetuned_language_models_are_zero_shot_learners.pdf filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
papers/Language_Models_are_Few_Shot_Learners.pdf filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
papers/Large_Language_Models_Survey.pdf filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
papers/llama.pdf filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
papers/ManualRenta2023_es_es.pdf filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
papers/More_Agents_Is_All_You_Need.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#chroma_db/
|
| 2 |
+
#*.sqlite3
|
| 3 |
+
|
| 4 |
+
# Byte-compiled / optimized / DLL files
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.py[cod]
|
| 7 |
+
*$py.class
|
| 8 |
+
|
| 9 |
+
# C extensions
|
| 10 |
+
*.so
|
| 11 |
+
|
| 12 |
+
# Distribution / packaging
|
| 13 |
+
.Python
|
| 14 |
+
build/
|
| 15 |
+
develop-eggs/
|
| 16 |
+
dist/
|
| 17 |
+
downloads/
|
| 18 |
+
eggs/
|
| 19 |
+
.eggs/
|
| 20 |
+
lib/
|
| 21 |
+
lib64/
|
| 22 |
+
parts/
|
| 23 |
+
sdist/
|
| 24 |
+
var/
|
| 25 |
+
wheels/
|
| 26 |
+
share/python-wheels/
|
| 27 |
+
*.egg-info/
|
| 28 |
+
.installed.cfg
|
| 29 |
+
*.egg
|
| 30 |
+
MANIFEST
|
| 31 |
+
|
| 32 |
+
# PyInstaller
|
| 33 |
+
# Usually these files are written by a python script from a template
|
| 34 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 35 |
+
*.manifest
|
| 36 |
+
*.spec
|
| 37 |
+
|
| 38 |
+
# Installer logs
|
| 39 |
+
pip-log.txt
|
| 40 |
+
pip-delete-this-directory.txt
|
| 41 |
+
|
| 42 |
+
# Unit test / coverage reports
|
| 43 |
+
htmlcov/
|
| 44 |
+
.tox/
|
| 45 |
+
.nox/
|
| 46 |
+
.coverage
|
| 47 |
+
.coverage.*
|
| 48 |
+
.cache
|
| 49 |
+
nosetests.xml
|
| 50 |
+
coverage.xml
|
| 51 |
+
*.cover
|
| 52 |
+
*.py,cover
|
| 53 |
+
.hypothesis/
|
| 54 |
+
.pytest_cache/
|
| 55 |
+
cover/
|
| 56 |
+
|
| 57 |
+
# Translations
|
| 58 |
+
*.mo
|
| 59 |
+
*.pot
|
| 60 |
+
|
| 61 |
+
# Django stuff:
|
| 62 |
+
*.log
|
| 63 |
+
local_settings.py
|
| 64 |
+
db.sqlite3
|
| 65 |
+
db.sqlite3-journal
|
| 66 |
+
|
| 67 |
+
# Flask stuff:
|
| 68 |
+
instance/
|
| 69 |
+
.webassets-cache
|
| 70 |
+
|
| 71 |
+
# Scrapy stuff:
|
| 72 |
+
.scrapy
|
| 73 |
+
|
| 74 |
+
# Sphinx documentation
|
| 75 |
+
docs/_build/
|
| 76 |
+
|
| 77 |
+
# PyBuilder
|
| 78 |
+
.pybuilder/
|
| 79 |
+
target/
|
| 80 |
+
|
| 81 |
+
# Jupyter Notebook
|
| 82 |
+
.ipynb_checkpoints
|
| 83 |
+
|
| 84 |
+
# IPython
|
| 85 |
+
profile_default/
|
| 86 |
+
ipython_config.py
|
| 87 |
+
|
| 88 |
+
# pyenv
|
| 89 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 90 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 91 |
+
# .python-version
|
| 92 |
+
|
| 93 |
+
# pipenv
|
| 94 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 95 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 96 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 97 |
+
# install all needed dependencies.
|
| 98 |
+
#Pipfile.lock
|
| 99 |
+
|
| 100 |
+
# UV
|
| 101 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 102 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 103 |
+
# commonly ignored for libraries.
|
| 104 |
+
#uv.lock
|
| 105 |
+
|
| 106 |
+
# poetry
|
| 107 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 108 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 109 |
+
# commonly ignored for libraries.
|
| 110 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 111 |
+
#poetry.lock
|
| 112 |
+
|
| 113 |
+
# pdm
|
| 114 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 115 |
+
#pdm.lock
|
| 116 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 117 |
+
# in version control.
|
| 118 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 119 |
+
.pdm.toml
|
| 120 |
+
.pdm-python
|
| 121 |
+
.pdm-build/
|
| 122 |
+
|
| 123 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 124 |
+
__pypackages__/
|
| 125 |
+
|
| 126 |
+
# Celery stuff
|
| 127 |
+
celerybeat-schedule
|
| 128 |
+
celerybeat.pid
|
| 129 |
+
|
| 130 |
+
# SageMath parsed files
|
| 131 |
+
*.sage.py
|
| 132 |
+
|
| 133 |
+
# Environments
|
| 134 |
+
.env
|
| 135 |
+
.venv
|
| 136 |
+
env/
|
| 137 |
+
venv/
|
| 138 |
+
ENV/
|
| 139 |
+
env.bak/
|
| 140 |
+
venv.bak/
|
| 141 |
+
|
| 142 |
+
# Spyder project settings
|
| 143 |
+
.spyderproject
|
| 144 |
+
.spyproject
|
| 145 |
+
|
| 146 |
+
# Rope project settings
|
| 147 |
+
.ropeproject
|
| 148 |
+
|
| 149 |
+
# mkdocs documentation
|
| 150 |
+
/site
|
| 151 |
+
|
| 152 |
+
# mypy
|
| 153 |
+
.mypy_cache/
|
| 154 |
+
.dmypy.json
|
| 155 |
+
dmypy.json
|
| 156 |
+
|
| 157 |
+
# Pyre type checker
|
| 158 |
+
.pyre/
|
| 159 |
+
|
| 160 |
+
# pytype static type analyzer
|
| 161 |
+
.pytype/
|
| 162 |
+
|
| 163 |
+
# Cython debug symbols
|
| 164 |
+
cython_debug/
|
| 165 |
+
|
| 166 |
+
# PyCharm
|
| 167 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 168 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 169 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 170 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 171 |
+
#.idea/
|
| 172 |
+
|
| 173 |
+
# PyPI configuration file
|
| 174 |
+
.pypirc
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
README.md
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agentic RAG
|
| 2 |
+
|
| 3 |
+
## Theoretical review
|
| 4 |
+
### What is a **Retrieval Augmented Generation (RAG)**?
|
| 5 |
+
|
| 6 |
+
Published by Meta AI in NeurIPS 2020 -> [Paper](https://research.facebook.com/publications/retrieval-augmented-generation-for-knowledge-intensive-nlp-tasks/).
|
| 7 |
+
|
| 8 |
+
**1. Overview**:
|
| 9 |
+
- **Hybrid Model**: Combines the strengths of retrieval-based and generation-based models.
|
| 10 |
+
- **Purpose**: Enhances LLM text generation by incorporating relevant external information, improving accuracy and context.
|
| 11 |
+
|
| 12 |
+
**2. Structure**:
|
| 13 |
+
- **Retriever**:
|
| 14 |
+
- Function: Searches a large corpus (e.g., documents, articles) to find relevant information based on the input query.
|
| 15 |
+
- Techniques: Uses methods like BM25, dense retrieval, or neural retrievers.
|
| 16 |
+
- **Generator**:
|
| 17 |
+
- Function: Generates a response using the information retrieved.
|
| 18 |
+
- Model: Typically a transformer-based model like GPT-4 or Llama.
|
| 19 |
+
|
| 20 |
+
**3. Workflow**:
|
| 21 |
+
1. **Input Query**: User provides a query or prompt.
|
| 22 |
+
2. **Document Retrieval**:
|
| 23 |
+
- The retriever fetches a set of relevant documents or passages.
|
| 24 |
+
- These documents provide context and factual information.
|
| 25 |
+
3. **Response Generation**:
|
| 26 |
+
- The generator uses the retrieved documents to produce a coherent and contextually accurate response.
|
| 27 |
+
- Ensures the generated text is informed by the most relevant information available.
|
| 28 |
+
|
| 29 |
+
**4. Benefits**:
|
| 30 |
+
1. **Enhanced Accuracy**: By grounding responses in real-world data, RAG models significantly improve the accuracy of generated content.
|
| 31 |
+
2. **Reduced Hallucinations**: The integration of external knowledge helps mitigate the risk of generating incorrect or nonsensical responses.
|
| 32 |
+
3. **Scalability**: RAG systems can handle vast amounts of data, making them suitable for enterprise-level applications.
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
#### **Pipeline**
|
| 36 |
+

|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
### What is a **Agentic-RAG**?
|
| 43 |
+
**1. Overview**:
|
| 44 |
+
- **Enhanced RAG**: Extends RAG by adding agent-like capabilities.
|
| 45 |
+
- **Purpose**: Designed to perform tasks autonomously, interacting with various tools and APIs to achieve specific goals.
|
| 46 |
+
|
| 47 |
+
**2. Structure**:
|
| 48 |
+
- **Retriever**:
|
| 49 |
+
- Function: Similar to RAG, it fetches relevant documents based on the input query.
|
| 50 |
+
- **Generator**:
|
| 51 |
+
- Function: Generates an initial response using the retrieved documents.
|
| 52 |
+
- **Agent Module**:
|
| 53 |
+
- Function: Evaluates the generated response, cross-references it with the knowledge base, and makes corrections if discrepancies are found.
|
| 54 |
+
|
| 55 |
+
**3. Workflow**:
|
| 56 |
+
1. **Input Query**: User provides a query/question/task.
|
| 57 |
+
2. **Document Retrieval**: The retriever fetches relevant documents to provide context.
|
| 58 |
+
3. **Initial Response Generation**: The generator creates a preliminary answer using the retrieved information.
|
| 59 |
+
4. **Response Verification**: The agent system assesses the initial response against the knowledge base to ensure accuracy.
|
| 60 |
+
5. **Response Correction (if needed)**: If inaccuracies are detected, the agent system refines the response to align with verified information.
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
**4. Benefits**:
|
| 64 |
+
1. **Improved Reliability**: The agent system's verification process ensures responses are accurate and trustworthy.
|
| 65 |
+
2. **Dynamic Correction**: Enables real-time adjustments to responses, enhancing the system's adaptability to new information.
|
| 66 |
+
3. **User Trust**: By providing verified answers, the system builds greater user confidence in its outputs.
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
#### **Pipeline**
|
| 70 |
+

|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## Frameworks recommended for Agents
|
| 75 |
+
[LangChain](https://python.langchain.com/docs/introduction/)
|
| 76 |
+
[LangGraph](https://langchain-ai.github.io/langgraph/)
|
| 77 |
+
[AutoGen](https://microsoft.github.io/autogen/0.2/docs/Getting-Started)
|
| 78 |
+
[SmolAgent](https://github.com/huggingface/smolagents)
|
| 79 |
+
[PydanticAI](https://ai.pydantic.dev/)
|
| 80 |
+
[Vector Database: Chroma](https://docs.trychroma.com/)
|
| 81 |
+
|
| 82 |
+
## Frameworks recommended to develop user interfaces.
|
| 83 |
+
[Streamlit](https://docs.streamlit.io/)
|
| 84 |
+
[Gradio](https://www.gradio.app/docs/python-client/introduction)
|
| 85 |
+
[Chainlit](https://docs.chainlit.io/get-started/overview)
|
| 86 |
+
|
| 87 |
+
---
|
| 88 |
+
|
| 89 |
+
## Code Examples
|
| 90 |
+
- [LangGraph Agentic RAG](https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/)
|
| 91 |
+
- [SmolAgent](https://github.com/huggingface/smolagents/tree/main/examples)
|
| 92 |
+
- [LangGraph ai agent for engineers](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/workshops/ai-agents/ai_agents_for_engineers.ipynb)
|
| 93 |
+
- [Agentic RAG with LangChain:](https://medium.com/@jagadeesan.ganesh/agentic-rag-with-langchain-revolutionizing-ai-with-dynamic-decision-making-ff1dee6df4ca)
|
| 94 |
+
|
| 95 |
+
## References
|
| 96 |
+
|
| 97 |
+
1. [Video: What is Agentic RAG?](https://www.youtube.com/watch?v=0z9_MhcYvcY)
|
| 98 |
+
2. [Video: LangChain vs LangGraph](https://www.youtube.com/watch?v=qAF1NjEVHhY)
|
| 99 |
+
3. [Video: Build Your Own AI Agent System from scratch!](https://www.youtube.com/watch?v=LzG_Vkd30Kg)
|
| 100 |
+
4. [Course: AI Agents in LangGraph](https://www.deeplearning.ai/short-courses/ai-agents-in-langgraph/)
|
| 101 |
+
5. [Course: Advanced Retrieval for AI with Chroma](https://www.deeplearning.ai/short-courses/advanced-retrieval-for-ai/)
|
| 102 |
+
6. [Course: AI Agents in LangGraph](https://www.deeplearning.ai/short-courses/ai-agents-in-langgraph/)
|
| 103 |
+
7. [A Comprehensive Guide to Building Agentic RAG Systems with LangGraph](https://www.analyticsvidhya.com/blog/2024/07/building-agentic-rag-systems-with-langgraph/)
|
| 104 |
+
8. [leewayhertz: Agentic RAG](https://www.leewayhertz.com/agentic-rag/)
|
| 105 |
+
9. [Vectorize: How I finally got agentic RAG to work right](https://vectorize.io/how-i-finally-got-agentic-rag-to-work-right/)
|
| 106 |
+
10. [Simple Agentic RAG for Multi Vector stores with LangChain and LangGraph](https://www.metadocs.co/2024/08/20/simple-agentic-rag-for-multi-vector-stores-with-langchain-and-langgraph/)
|
| 107 |
+
|
| 108 |
+
---
|
chatbot_complete_traces.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from rag_smolagent import SmolAgent
|
| 4 |
+
from smolagents import GradioUI
|
| 5 |
+
|
| 6 |
+
agent = SmolAgent()
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
GradioUI(agent.agent, file_upload_folder='./test').launch()
|
chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5e2c0bf2e5c814e041b88b59284f7c9ca5f8eb796b25e148edc64a562c2e7d6
|
| 3 |
+
size 99424000
|
chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73980fa279463b4996b54533e1212ac256cec5e47724f8d7fcb5afa8cc787de9
|
| 3 |
+
size 100
|
chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b027bb4a67df57d1fc8a70da25cae105bc18f7ca260651357bd37e31d2cdf193
|
| 3 |
+
size 462312
|
chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b46396124c9835de057437790129fd2728234a2055855368d2fdf8e67f39448f
|
| 3 |
+
size 32000
|
chroma_db/26626544-3216-4217-8b2e-7c0e117a4053/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:446e93dc48161698cd4667c6093ff09717ddc22319aa2f85f7195e2eabbe2977
|
| 3 |
+
size 69128
|
chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5591b3412b2a6b18cb493e0a5d3e5c12a9206e9de6b8123b80de91515195906
|
| 3 |
+
size 12428000
|
chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88931a1cba2ba384615dce8363769649fe85af4facd17312c439923242f9e8c8
|
| 3 |
+
size 100
|
chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e950ee5229060ad37a7e99c1e0cb87b9f529b8b7664f46a115aab634d3fc5186
|
| 3 |
+
size 56207
|
chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c6938e7dc4f002b7e46e94c470a13a840040adb2dca0be07e0ebe80d0e13663
|
| 3 |
+
size 4000
|
chroma_db/397565d9-d9aa-4de8-922a-883a4693e98e/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f68764a92b7e26f3672a20ee2d351fe55ddc87831327db9f3a85ebb773a08e29
|
| 3 |
+
size 8420
|
chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d88dc53e8488fd838b8ae4185f4b3340e85eeb6fa0b83af2438171a790e2e366
|
| 3 |
+
size 12428000
|
chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88931a1cba2ba384615dce8363769649fe85af4facd17312c439923242f9e8c8
|
| 3 |
+
size 100
|
chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e63c81fe1c420d01813dc382ce3e146dda28ad8e3ab2aa29e9102f77798046a6
|
| 3 |
+
size 56207
|
chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:224cd3fafdabcb569a381a46586f0ca7a4738d5acbcc56f907f722f1623d0c95
|
| 3 |
+
size 4000
|
chroma_db/67c0890a-3d55-4f87-860a-6f0f45f80f1f/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e78edb5571076b0e9d517275a1537d5d7153e2917fc50d9576f2619fec1df553
|
| 3 |
+
size 8420
|
chroma_db/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d1abd73d2e89b6917268826c874f8b9d915ae265f403a365a16562760c4d68f
|
| 3 |
+
size 205418496
|
chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64e66a169d5e6bd7f48d48f66a7c8f80e01c7b7f3d21a4f26b54320df4aca40e
|
| 3 |
+
size 74568000
|
chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58691bb737546a2e2d105899caae7c981676b46ce0c8ca79dc330854b9029832
|
| 3 |
+
size 100
|
chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ec2a6461d03f3f03b6f12b29c03196b1bbe7f23a90579650f014619b78202f0
|
| 3 |
+
size 346282
|
chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:151ff79f29e96d211576b9a2e3e78f518b26109916616945d50cdee82dd2ba8b
|
| 3 |
+
size 24000
|
chroma_db/db8cde4c-745c-45d8-a508-1266a9927646/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0719792cef0762348a92b58c3b74d730e21607d3212a09d95eb94ef6b94e526c
|
| 3 |
+
size 51880
|
chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b8d4b3825a7c7a773e22fa3eeef0e7d15a695f5c4183aeff5beb07741a68679
|
| 3 |
+
size 12428000
|
chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8a3ec48846fc6fdfaef19f5ed2508f0bf3da4a3c93b0f6b3dd21f0a22ec1026
|
| 3 |
+
size 100
|
chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16532ffc7a8c1e2ad4c26974dc1f44008c1562ddec81aeedc899d4118fe6b457
|
| 3 |
+
size 4000
|
chroma_db/e83b0840-8a7f-427a-a846-bc626ef433d9/link_lists.bin
ADDED
|
File without changes
|
chroma_db/eca6771a-e870-49de-bf11-9f330748609c/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5afe5636d8a1a0bffa5e271c03120dcdf140c235e06aef48a4ce59106ad0bcdc
|
| 3 |
+
size 24856000
|
chroma_db/eca6771a-e870-49de-bf11-9f330748609c/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59852b48932154864a3b32b3bba3dbdfd054a1fe2588aeec39257ef4c019e457
|
| 3 |
+
size 100
|
chroma_db/eca6771a-e870-49de-bf11-9f330748609c/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:995b4f67c08be129529efb4d5812d464d72ad2dbabb7f2aa66abad7be39fe133
|
| 3 |
+
size 113967
|
chroma_db/eca6771a-e870-49de-bf11-9f330748609c/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f49446ec6c7ce2fcd02745bbd5ffd6d1f57f814276bbe41160e3377245cc5928
|
| 3 |
+
size 8000
|
chroma_db/eca6771a-e870-49de-bf11-9f330748609c/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6f4c10d6a3aa1930668fcb9f974850263254a6ebf6535ebcb9618c343bc8284
|
| 3 |
+
size 16976
|
constants.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PROMPT_TEMPLATE = """
|
| 2 |
+
Eres un gestor profesional y tu tarea consiste en dar una respuesta a la consulta del usuario. Tienes acceso a un sistema de Retrieval-Augmented Generation (RAG), que puedes consultar para encontrar el contexto relevante en estos documentos.
|
| 3 |
+
Para responder a la consulta, sigue estos pasos:
|
| 4 |
+
|
| 5 |
+
1. En primer lugar, debes comprobar si la consulta es puramente conversacional y no necesitas acceder a ninguna base de datos para responderla.
|
| 6 |
+
Si es así, procede a responderla de un modo profesional, la respuesta debe seguir el siguiente formato:
|
| 7 |
+
|
| 8 |
+
Respuesta:
|
| 9 |
+
<su respuesta>
|
| 10 |
+
|
| 11 |
+
2. Si la consulta del usuario *requiere* que recuperes información de la base de datos:
|
| 12 |
+
- Responde después de obtener toda la información necesaria (puedes hacer todas las llamadas que consideres).
|
| 13 |
+
- A continuación, indica las fuentes que hayas utilizado. Para ello debes incluir el contenido de los headers que se incluyen en los metadatos en el apartado "Fuentes:".
|
| 14 |
+
- Por ejemplo, si el sistema RAG devuelve:
|
| 15 |
+
===== Document {{'Header 1': 'Guía de las principales novedades del IRPF en el ejercicio 2023', 'Header 2': 'Resultado de la declaración', 'page': '29'}} =====
|
| 16 |
+
===== Document {{'Header 1': 'Capítulo 4. Rendimientos del capital inmobiliario', 'Header 2': 'Rendimiento mínimo computable en caso de parentesco', 'page': '284'}} =====
|
| 17 |
+
El output debe seguir el siguiente formato:
|
| 18 |
+
|
| 19 |
+
Respuesta:
|
| 20 |
+
<Su respuesta aquí>
|
| 21 |
+
|
| 22 |
+
Fuentes:
|
| 23 |
+
<Header 1>, <Header 2>, ..., <Header n>, <page>
|
| 24 |
+
<Header 1>, <Header 2>, ..., <Header n>, <page>
|
| 25 |
+
...
|
| 26 |
+
|
| 27 |
+
3. - Make sure to include code with the correct pattern, for instance:
|
| 28 |
+
Thoughts: Your thoughts
|
| 29 |
+
Code:
|
| 30 |
+
```py
|
| 31 |
+
# Your python code here
|
| 32 |
+
```<end_code>
|
| 33 |
+
Make sure to provide correct code blobs, with always a though and code.
|
| 34 |
+
|
| 35 |
+
- Una vez obtenido el output (el cual contiene 'Respuesta' y 'Fuentes'), llama a final_answer(output).
|
| 36 |
+
Code:
|
| 37 |
+
```py
|
| 38 |
+
final_answer("YOUR FINAL ANSWER HERE")
|
| 39 |
+
```<end_code>
|
| 40 |
+
|
| 41 |
+
4. Hay dos formatos de respuesta:
|
| 42 |
+
- Si es 'Concise' la respuesta debe responder completa pero únicamente a la consulta.
|
| 43 |
+
- Si el formato es 'Detailed' la respuesta tiene que ser extensa y detallada, dando información adicional que pueda ser relevante para el usuario y explicando conceptos si es necesario.
|
| 44 |
+
El usuario desea que la respuesta sea '{response_type}'
|
| 45 |
+
|
| 46 |
+
La query del usuario es:
|
| 47 |
+
{extra_info} {query}
|
| 48 |
+
"""
|
create_chromadb.ipynb
ADDED
|
@@ -0,0 +1,1139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"Requirement already satisfied: datasets in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 1)) (3.2.0)\n",
|
| 13 |
+
"Requirement already satisfied: transformers in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 2)) (4.48.1)\n",
|
| 14 |
+
"Requirement already satisfied: langchain in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 3)) (0.3.15)\n",
|
| 15 |
+
"Requirement already satisfied: langchain-community in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 4)) (0.3.15)\n",
|
| 16 |
+
"Requirement already satisfied: smolagents in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 5)) (1.8.1)\n",
|
| 17 |
+
"Requirement already satisfied: rank_bm25 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 6)) (0.2.2)\n",
|
| 18 |
+
"Requirement already satisfied: litellm in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 7)) (1.59.5)\n",
|
| 19 |
+
"Requirement already satisfied: chromadb in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 8)) (0.6.3)\n",
|
| 20 |
+
"Requirement already satisfied: reportlab in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 9)) (4.2.5)\n",
|
| 21 |
+
"Requirement already satisfied: gradio in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 10)) (5.16.1)\n",
|
| 22 |
+
"Requirement already satisfied: sentence-transformers in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from -r requirements.txt (line 11)) (3.4.0)\n",
|
| 23 |
+
"Requirement already satisfied: filelock in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (3.17.0)\n",
|
| 24 |
+
"Requirement already satisfied: numpy>=1.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (1.26.4)\n",
|
| 25 |
+
"Requirement already satisfied: pyarrow>=15.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (19.0.0)\n",
|
| 26 |
+
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (0.3.8)\n",
|
| 27 |
+
"Requirement already satisfied: pandas in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (2.2.3)\n",
|
| 28 |
+
"Requirement already satisfied: requests>=2.32.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (2.32.3)\n",
|
| 29 |
+
"Requirement already satisfied: tqdm>=4.66.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (4.67.1)\n",
|
| 30 |
+
"Requirement already satisfied: xxhash in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (3.5.0)\n",
|
| 31 |
+
"Requirement already satisfied: multiprocess<0.70.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (0.70.16)\n",
|
| 32 |
+
"Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->-r requirements.txt (line 1)) (2024.9.0)\n",
|
| 33 |
+
"Requirement already satisfied: aiohttp in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (3.11.11)\n",
|
| 34 |
+
"Requirement already satisfied: huggingface-hub>=0.23.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (0.28.1)\n",
|
| 35 |
+
"Requirement already satisfied: packaging in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (24.2)\n",
|
| 36 |
+
"Requirement already satisfied: pyyaml>=5.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from datasets->-r requirements.txt (line 1)) (6.0.2)\n",
|
| 37 |
+
"Requirement already satisfied: regex!=2019.12.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (2024.11.6)\n",
|
| 38 |
+
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (0.21.0)\n",
|
| 39 |
+
"Requirement already satisfied: safetensors>=0.4.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from transformers->-r requirements.txt (line 2)) (0.5.2)\n",
|
| 40 |
+
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (2.0.37)\n",
|
| 41 |
+
"Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (4.0.3)\n",
|
| 42 |
+
"Requirement already satisfied: langchain-core<0.4.0,>=0.3.31 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (0.3.31)\n",
|
| 43 |
+
"Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (0.3.5)\n",
|
| 44 |
+
"Requirement already satisfied: langsmith<0.4,>=0.1.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (0.3.1)\n",
|
| 45 |
+
"Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (2.10.5)\n",
|
| 46 |
+
"Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain->-r requirements.txt (line 3)) (9.0.0)\n",
|
| 47 |
+
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain-community->-r requirements.txt (line 4)) (0.6.7)\n",
|
| 48 |
+
"Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain-community->-r requirements.txt (line 4)) (0.4.0)\n",
|
| 49 |
+
"Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain-community->-r requirements.txt (line 4)) (2.7.1)\n",
|
| 50 |
+
"Requirement already satisfied: rich>=13.9.4 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from smolagents->-r requirements.txt (line 5)) (13.9.4)\n",
|
| 51 |
+
"Requirement already satisfied: jinja2>=3.1.4 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from smolagents->-r requirements.txt (line 5)) (3.1.5)\n",
|
| 52 |
+
"Requirement already satisfied: pillow>=11.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from smolagents->-r requirements.txt (line 5)) (11.1.0)\n",
|
| 53 |
+
"Requirement already satisfied: markdownify>=0.14.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from smolagents->-r requirements.txt (line 5)) (0.14.1)\n",
|
| 54 |
+
"Requirement already satisfied: duckduckgo-search>=6.3.7 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from smolagents->-r requirements.txt (line 5)) (7.2.1)\n",
|
| 55 |
+
"Requirement already satisfied: python-dotenv in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from smolagents->-r requirements.txt (line 5)) (1.0.1)\n",
|
| 56 |
+
"Requirement already satisfied: click in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from litellm->-r requirements.txt (line 7)) (8.1.8)\n",
|
| 57 |
+
"Requirement already satisfied: httpx<0.28.0,>=0.23.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from litellm->-r requirements.txt (line 7)) (0.27.2)\n",
|
| 58 |
+
"Requirement already satisfied: importlib-metadata>=6.8.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from litellm->-r requirements.txt (line 7)) (8.5.0)\n",
|
| 59 |
+
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from litellm->-r requirements.txt (line 7)) (4.23.0)\n",
|
| 60 |
+
"Requirement already satisfied: openai>=1.55.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from litellm->-r requirements.txt (line 7)) (1.60.0)\n",
|
| 61 |
+
"Requirement already satisfied: tiktoken>=0.7.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from litellm->-r requirements.txt (line 7)) (0.8.0)\n",
|
| 62 |
+
"Requirement already satisfied: build>=1.0.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (1.2.2.post1)\n",
|
| 63 |
+
"Requirement already satisfied: chroma-hnswlib==0.7.6 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (0.7.6)\n",
|
| 64 |
+
"Requirement already satisfied: fastapi>=0.95.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (0.115.6)\n",
|
| 65 |
+
"Requirement already satisfied: uvicorn>=0.18.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r requirements.txt (line 8)) (0.34.0)\n",
|
| 66 |
+
"Requirement already satisfied: posthog>=2.4.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (3.9.2)\n",
|
| 67 |
+
"Requirement already satisfied: typing_extensions>=4.5.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (4.12.2)\n",
|
| 68 |
+
"Requirement already satisfied: onnxruntime>=1.14.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (1.20.1)\n",
|
| 69 |
+
"Requirement already satisfied: opentelemetry-api>=1.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (1.29.0)\n",
|
| 70 |
+
"Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (1.29.0)\n",
|
| 71 |
+
"Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (0.50b0)\n",
|
| 72 |
+
"Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (1.29.0)\n",
|
| 73 |
+
"Requirement already satisfied: pypika>=0.48.9 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (0.48.9)\n",
|
| 74 |
+
"Requirement already satisfied: overrides>=7.3.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (7.7.0)\n",
|
| 75 |
+
"Requirement already satisfied: importlib-resources in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (6.5.2)\n",
|
| 76 |
+
"Requirement already satisfied: grpcio>=1.58.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (1.69.0)\n",
|
| 77 |
+
"Requirement already satisfied: bcrypt>=4.0.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (4.2.1)\n",
|
| 78 |
+
"Requirement already satisfied: typer>=0.9.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (0.12.5)\n",
|
| 79 |
+
"Requirement already satisfied: kubernetes>=28.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (31.0.0)\n",
|
| 80 |
+
"Requirement already satisfied: mmh3>=4.0.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (5.0.1)\n",
|
| 81 |
+
"Requirement already satisfied: orjson>=3.9.12 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from chromadb->-r requirements.txt (line 8)) (3.10.15)\n",
|
| 82 |
+
"Requirement already satisfied: chardet in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from reportlab->-r requirements.txt (line 9)) (5.2.0)\n",
|
| 83 |
+
"Requirement already satisfied: aiofiles<24.0,>=22.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (23.2.1)\n",
|
| 84 |
+
"Requirement already satisfied: anyio<5.0,>=3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (4.8.0)\n",
|
| 85 |
+
"Requirement already satisfied: ffmpy in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.5.0)\n",
|
| 86 |
+
"Requirement already satisfied: gradio-client==1.7.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (1.7.0)\n",
|
| 87 |
+
"Requirement already satisfied: markupsafe~=2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (2.1.5)\n",
|
| 88 |
+
"Requirement already satisfied: pydub in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.25.1)\n",
|
| 89 |
+
"Requirement already satisfied: python-multipart>=0.0.18 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.0.18)\n",
|
| 90 |
+
"Requirement already satisfied: ruff>=0.9.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.9.5)\n",
|
| 91 |
+
"Requirement already satisfied: safehttpx<0.2.0,>=0.1.6 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.1.6)\n",
|
| 92 |
+
"Requirement already satisfied: semantic-version~=2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (2.10.0)\n",
|
| 93 |
+
"Requirement already satisfied: starlette<1.0,>=0.40.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.41.3)\n",
|
| 94 |
+
"Requirement already satisfied: tomlkit<0.14.0,>=0.12.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio->-r requirements.txt (line 10)) (0.13.2)\n",
|
| 95 |
+
"Requirement already satisfied: websockets<15.0,>=10.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from gradio-client==1.7.0->gradio->-r requirements.txt (line 10)) (14.2)\n",
|
| 96 |
+
"Requirement already satisfied: torch>=1.11.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from sentence-transformers->-r requirements.txt (line 11)) (2.6.0)\n",
|
| 97 |
+
"Requirement already satisfied: scikit-learn in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from sentence-transformers->-r requirements.txt (line 11)) (1.6.1)\n",
|
| 98 |
+
"Requirement already satisfied: scipy in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from sentence-transformers->-r requirements.txt (line 11)) (1.11.4)\n",
|
| 99 |
+
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (2.4.4)\n",
|
| 100 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (1.3.2)\n",
|
| 101 |
+
"Requirement already satisfied: attrs>=17.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (24.3.0)\n",
|
| 102 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (1.5.0)\n",
|
| 103 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (6.1.0)\n",
|
| 104 |
+
"Requirement already satisfied: propcache>=0.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (0.2.1)\n",
|
| 105 |
+
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (1.18.3)\n",
|
| 106 |
+
"Requirement already satisfied: exceptiongroup>=1.0.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from anyio<5.0,>=3.0->gradio->-r requirements.txt (line 10)) (1.2.2)\n",
|
| 107 |
+
"Requirement already satisfied: idna>=2.8 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from anyio<5.0,>=3.0->gradio->-r requirements.txt (line 10)) (3.10)\n",
|
| 108 |
+
"Requirement already satisfied: sniffio>=1.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from anyio<5.0,>=3.0->gradio->-r requirements.txt (line 10)) (1.3.1)\n",
|
| 109 |
+
"Requirement already satisfied: pyproject_hooks in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from build>=1.0.3->chromadb->-r requirements.txt (line 8)) (1.2.0)\n",
|
| 110 |
+
"Requirement already satisfied: tomli>=1.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from build>=1.0.3->chromadb->-r requirements.txt (line 8)) (2.2.1)\n",
|
| 111 |
+
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community->-r requirements.txt (line 4)) (3.26.0)\n",
|
| 112 |
+
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community->-r requirements.txt (line 4)) (0.9.0)\n",
|
| 113 |
+
"Requirement already satisfied: primp>=0.10.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from duckduckgo-search>=6.3.7->smolagents->-r requirements.txt (line 5)) (0.10.1)\n",
|
| 114 |
+
"Requirement already satisfied: lxml>=5.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from duckduckgo-search>=6.3.7->smolagents->-r requirements.txt (line 5)) (5.3.0)\n",
|
| 115 |
+
"Requirement already satisfied: certifi in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from httpx<0.28.0,>=0.23.0->litellm->-r requirements.txt (line 7)) (2024.12.14)\n",
|
| 116 |
+
"Requirement already satisfied: httpcore==1.* in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from httpx<0.28.0,>=0.23.0->litellm->-r requirements.txt (line 7)) (1.0.7)\n",
|
| 117 |
+
"Requirement already satisfied: h11<0.15,>=0.13 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.23.0->litellm->-r requirements.txt (line 7)) (0.14.0)\n",
|
| 118 |
+
"Requirement already satisfied: zipp>=3.20 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from importlib-metadata>=6.8.0->litellm->-r requirements.txt (line 7)) (3.21.0)\n",
|
| 119 |
+
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm->-r requirements.txt (line 7)) (2024.10.1)\n",
|
| 120 |
+
"Requirement already satisfied: referencing>=0.28.4 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm->-r requirements.txt (line 7)) (0.36.1)\n",
|
| 121 |
+
"Requirement already satisfied: rpds-py>=0.7.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm->-r requirements.txt (line 7)) (0.22.3)\n",
|
| 122 |
+
"Requirement already satisfied: six>=1.9.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (1.17.0)\n",
|
| 123 |
+
"Requirement already satisfied: python-dateutil>=2.5.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (2.9.0.post0)\n",
|
| 124 |
+
"Requirement already satisfied: google-auth>=1.0.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (2.37.0)\n",
|
| 125 |
+
"Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (1.8.0)\n",
|
| 126 |
+
"Requirement already satisfied: requests-oauthlib in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (2.0.0)\n",
|
| 127 |
+
"Requirement already satisfied: oauthlib>=3.2.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (3.2.2)\n",
|
| 128 |
+
"Requirement already satisfied: urllib3>=1.24.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (2.3.0)\n",
|
| 129 |
+
"Requirement already satisfied: durationpy>=0.7 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (0.9)\n",
|
| 130 |
+
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langchain-core<0.4.0,>=0.3.31->langchain->-r requirements.txt (line 3)) (1.33)\n",
|
| 131 |
+
"Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langsmith<0.4,>=0.1.17->langchain->-r requirements.txt (line 3)) (1.0.0)\n",
|
| 132 |
+
"Requirement already satisfied: zstandard<0.24.0,>=0.23.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from langsmith<0.4,>=0.1.17->langchain->-r requirements.txt (line 3)) (0.23.0)\n",
|
| 133 |
+
"Requirement already satisfied: beautifulsoup4<5,>=4.9 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from markdownify>=0.14.1->smolagents->-r requirements.txt (line 5)) (4.12.3)\n",
|
| 134 |
+
"Requirement already satisfied: coloredlogs in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r requirements.txt (line 8)) (15.0.1)\n",
|
| 135 |
+
"Requirement already satisfied: flatbuffers in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r requirements.txt (line 8)) (25.1.21)\n",
|
| 136 |
+
"Requirement already satisfied: protobuf in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r requirements.txt (line 8)) (5.29.3)\n",
|
| 137 |
+
"Requirement already satisfied: sympy in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from onnxruntime>=1.14.1->chromadb->-r requirements.txt (line 8)) (1.13.1)\n",
|
| 138 |
+
"Requirement already satisfied: distro<2,>=1.7.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from openai>=1.55.3->litellm->-r requirements.txt (line 7)) (1.9.0)\n",
|
| 139 |
+
"Requirement already satisfied: jiter<1,>=0.4.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from openai>=1.55.3->litellm->-r requirements.txt (line 7)) (0.8.2)\n",
|
| 140 |
+
"Requirement already satisfied: deprecated>=1.2.6 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-api>=1.2.0->chromadb->-r requirements.txt (line 8)) (1.2.15)\n",
|
| 141 |
+
"Requirement already satisfied: googleapis-common-protos~=1.52 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r requirements.txt (line 8)) (1.66.0)\n",
|
| 142 |
+
"Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.29.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r requirements.txt (line 8)) (1.29.0)\n",
|
| 143 |
+
"Requirement already satisfied: opentelemetry-proto==1.29.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb->-r requirements.txt (line 8)) (1.29.0)\n",
|
| 144 |
+
"Requirement already satisfied: opentelemetry-instrumentation-asgi==0.50b0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r requirements.txt (line 8)) (0.50b0)\n",
|
| 145 |
+
"Requirement already satisfied: opentelemetry-instrumentation==0.50b0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r requirements.txt (line 8)) (0.50b0)\n",
|
| 146 |
+
"Requirement already satisfied: opentelemetry-semantic-conventions==0.50b0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r requirements.txt (line 8)) (0.50b0)\n",
|
| 147 |
+
"Requirement already satisfied: opentelemetry-util-http==0.50b0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r requirements.txt (line 8)) (0.50b0)\n",
|
| 148 |
+
"Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-instrumentation==0.50b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r requirements.txt (line 8)) (1.17.2)\n",
|
| 149 |
+
"Requirement already satisfied: asgiref~=3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from opentelemetry-instrumentation-asgi==0.50b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb->-r requirements.txt (line 8)) (3.8.1)\n",
|
| 150 |
+
"Requirement already satisfied: pytz>=2020.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pandas->datasets->-r requirements.txt (line 1)) (2024.2)\n",
|
| 151 |
+
"Requirement already satisfied: tzdata>=2022.7 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pandas->datasets->-r requirements.txt (line 1)) (2025.1)\n",
|
| 152 |
+
"Requirement already satisfied: monotonic>=1.5 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from posthog>=2.4.0->chromadb->-r requirements.txt (line 8)) (1.6)\n",
|
| 153 |
+
"Requirement already satisfied: backoff>=1.10.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from posthog>=2.4.0->chromadb->-r requirements.txt (line 8)) (2.2.1)\n",
|
| 154 |
+
"Requirement already satisfied: annotated-types>=0.6.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain->-r requirements.txt (line 3)) (0.7.0)\n",
|
| 155 |
+
"Requirement already satisfied: pydantic-core==2.27.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pydantic<3.0.0,>=2.7.4->langchain->-r requirements.txt (line 3)) (2.27.2)\n",
|
| 156 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from requests>=2.32.2->datasets->-r requirements.txt (line 1)) (3.4.1)\n",
|
| 157 |
+
"Requirement already satisfied: markdown-it-py>=2.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from rich>=13.9.4->smolagents->-r requirements.txt (line 5)) (3.0.0)\n",
|
| 158 |
+
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from rich>=13.9.4->smolagents->-r requirements.txt (line 5)) (2.19.1)\n",
|
| 159 |
+
"Requirement already satisfied: greenlet!=0.4.17 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from SQLAlchemy<3,>=1.4->langchain->-r requirements.txt (line 3)) (3.1.1)\n",
|
| 160 |
+
"Requirement already satisfied: networkx in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (3.4.2)\n",
|
| 161 |
+
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.4.127)\n",
|
| 162 |
+
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.4.127)\n",
|
| 163 |
+
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.4.127)\n",
|
| 164 |
+
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (9.1.0.70)\n",
|
| 165 |
+
"Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.4.5.8)\n",
|
| 166 |
+
"Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (11.2.1.3)\n",
|
| 167 |
+
"Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (10.3.5.147)\n",
|
| 168 |
+
"Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (11.6.1.9)\n",
|
| 169 |
+
"Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.3.1.170)\n",
|
| 170 |
+
"Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (0.6.2)\n",
|
| 171 |
+
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (2.21.5)\n",
|
| 172 |
+
"Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.4.127)\n",
|
| 173 |
+
"Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (12.4.127)\n",
|
| 174 |
+
"Requirement already satisfied: triton==3.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers->-r requirements.txt (line 11)) (3.2.0)\n",
|
| 175 |
+
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from sympy->onnxruntime>=1.14.1->chromadb->-r requirements.txt (line 8)) (1.3.0)\n",
|
| 176 |
+
"Requirement already satisfied: shellingham>=1.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from typer>=0.9.0->chromadb->-r requirements.txt (line 8)) (1.5.4)\n",
|
| 177 |
+
"Requirement already satisfied: httptools>=0.6.3 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r requirements.txt (line 8)) (0.6.4)\n",
|
| 178 |
+
"Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r requirements.txt (line 8)) (0.21.0)\n",
|
| 179 |
+
"Requirement already satisfied: watchfiles>=0.13 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from uvicorn[standard]>=0.18.3->chromadb->-r requirements.txt (line 8)) (0.20.0)\n",
|
| 180 |
+
"Requirement already satisfied: joblib>=1.2.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from scikit-learn->sentence-transformers->-r requirements.txt (line 11)) (1.4.2)\n",
|
| 181 |
+
"Requirement already satisfied: threadpoolctl>=3.1.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from scikit-learn->sentence-transformers->-r requirements.txt (line 11)) (3.5.0)\n",
|
| 182 |
+
"Requirement already satisfied: soupsieve>1.2 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from beautifulsoup4<5,>=4.9->markdownify>=0.14.1->smolagents->-r requirements.txt (line 5)) (2.6)\n",
|
| 183 |
+
"Requirement already satisfied: cachetools<6.0,>=2.0.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (5.5.1)\n",
|
| 184 |
+
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (0.4.1)\n",
|
| 185 |
+
"Requirement already satisfied: rsa<5,>=3.1.4 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (4.9)\n",
|
| 186 |
+
"Requirement already satisfied: jsonpointer>=1.9 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.31->langchain->-r requirements.txt (line 3)) (3.0.0)\n",
|
| 187 |
+
"Requirement already satisfied: mdurl~=0.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=13.9.4->smolagents->-r requirements.txt (line 5)) (0.1.2)\n",
|
| 188 |
+
"Requirement already satisfied: mypy-extensions>=0.3.0 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community->-r requirements.txt (line 4)) (1.0.0)\n",
|
| 189 |
+
"Requirement already satisfied: humanfriendly>=9.1 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb->-r requirements.txt (line 8)) (10.0)\n",
|
| 190 |
+
"Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb->-r requirements.txt (line 8)) (0.6.1)\n",
|
| 191 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
| 192 |
+
]
|
| 193 |
+
}
|
| 194 |
+
],
|
| 195 |
+
"source": [
|
| 196 |
+
"%pip install -r requirements.txt"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"cell_type": "code",
|
| 201 |
+
"execution_count": 18,
|
| 202 |
+
"metadata": {},
|
| 203 |
+
"outputs": [],
|
| 204 |
+
"source": [
|
| 205 |
+
"import os\n",
|
| 206 |
+
"from dotenv import load_dotenv\n",
|
| 207 |
+
"\n",
|
| 208 |
+
"_ = load_dotenv()\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"LLM_BASE = os.getenv(\"AZURE_OPENAI_BASE\")\n",
|
| 211 |
+
"LLM_VERSION = os.getenv(\"AZURE_OPENAI_VERSION\")\n",
|
| 212 |
+
"LLMI_API_KEY = os.getenv(\"AZURE_OPENAI_API_KEY\")\n",
|
| 213 |
+
"LLM_NAME = os.getenv(\"AZURE_OPENAI_MODEL\")\n",
|
| 214 |
+
"\n",
|
| 215 |
+
"EMBEDDING_BASE = os.getenv(\"AZURE_OPENAI_EMBEDDING_BASE\")\n",
|
| 216 |
+
"EMBEDDING_VERSION = os.getenv(\"AZURE_OPENAI_EMBEDDING_VERSION\")\n",
|
| 217 |
+
"EMBEDDING_API_KEY = os.getenv(\"AZURE_OPENAI_EMBEDDING_API_KEY\")\n",
|
| 218 |
+
"EMBEDDING_NAME = os.getenv(\"AZURE_OPENAI_EMBEDDING_MODEL\")\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"# Path to the dataset.\n",
|
| 221 |
+
"DATA_PATH = os.path.join(\"papers\")\n",
|
| 222 |
+
"# Path to save Chroma database\n",
|
| 223 |
+
"CHROMA_PATH = \"./chroma_db\"\n"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"cell_type": "code",
|
| 228 |
+
"execution_count": 19,
|
| 229 |
+
"metadata": {},
|
| 230 |
+
"outputs": [],
|
| 231 |
+
"source": [
|
| 232 |
+
"from uuid import uuid4\n",
|
| 233 |
+
"from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter\n",
|
| 234 |
+
"from langchain.document_loaders.pdf import PyPDFDirectoryLoader"
|
| 235 |
+
]
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"cell_type": "markdown",
|
| 239 |
+
"metadata": {},
|
| 240 |
+
"source": [
|
| 241 |
+
"### Read documents and preprocess"
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"cell_type": "code",
|
| 246 |
+
"execution_count": 20,
|
| 247 |
+
"metadata": {},
|
| 248 |
+
"outputs": [],
|
| 249 |
+
"source": [
|
| 250 |
+
"# Initialize text splitter with specified parameters\n",
|
| 251 |
+
"# text_splitter = RecursiveCharacterTextSplitter(\n",
|
| 252 |
+
"# chunk_size=1000, # Size of each chunk in characters\n",
|
| 253 |
+
"# chunk_overlap=100, # Overlap between consecutive chunks\n",
|
| 254 |
+
"# length_function=len, # Function to compute the length of the text\n",
|
| 255 |
+
"# add_start_index=True, # Flag to add start index to each chunk\n",
|
| 256 |
+
"# )\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"# Load and split PDF documents\n",
|
| 259 |
+
"# document_loader = PyPDFDirectoryLoader(DATA_PATH)\n",
|
| 260 |
+
"# documents = document_loader.load()\n",
|
| 261 |
+
"# chunks = text_splitter.split_documents(documents)\n",
|
| 262 |
+
"# print(f\"Total chunks extracted from data folder: {len(chunks)}\")\n"
|
| 263 |
+
]
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"cell_type": "markdown",
|
| 267 |
+
"metadata": {},
|
| 268 |
+
"source": [
|
| 269 |
+
"### Create ChromaDB using embeddings."
|
| 270 |
+
]
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"cell_type": "code",
|
| 274 |
+
"execution_count": 21,
|
| 275 |
+
"metadata": {},
|
| 276 |
+
"outputs": [
|
| 277 |
+
{
|
| 278 |
+
"name": "stdout",
|
| 279 |
+
"output_type": "stream",
|
| 280 |
+
"text": [
|
| 281 |
+
"[array([-0.02441968, -0.00315892, -0.00488673, ..., 0.006077 ,\n",
|
| 282 |
+
" -0.01348737, -0.00549059], dtype=float32)]\n"
|
| 283 |
+
]
|
| 284 |
+
}
|
| 285 |
+
],
|
| 286 |
+
"source": [
|
| 287 |
+
"import chromadb.utils.embedding_functions as embedding_functions\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"openai_embedding = embedding_functions.OpenAIEmbeddingFunction(\n",
|
| 290 |
+
" api_key=EMBEDDING_API_KEY,\n",
|
| 291 |
+
" api_base=EMBEDDING_BASE,\n",
|
| 292 |
+
" api_type=\"azure\",\n",
|
| 293 |
+
" api_version=EMBEDDING_VERSION,\n",
|
| 294 |
+
" model_name=EMBEDDING_NAME.split(\"/\")[-1],\n",
|
| 295 |
+
")\n",
|
| 296 |
+
"print(openai_embedding([\"Hola\"]))"
|
| 297 |
+
]
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"cell_type": "code",
|
| 301 |
+
"execution_count": 27,
|
| 302 |
+
"metadata": {},
|
| 303 |
+
"outputs": [
|
| 304 |
+
{
|
| 305 |
+
"data": {
|
| 306 |
+
"text/plain": [
|
| 307 |
+
"['LLM_PAPERS', 'RENTA_2023', 'RENTA_2023_2']"
|
| 308 |
+
]
|
| 309 |
+
},
|
| 310 |
+
"execution_count": 27,
|
| 311 |
+
"metadata": {},
|
| 312 |
+
"output_type": "execute_result"
|
| 313 |
+
}
|
| 314 |
+
],
|
| 315 |
+
"source": [
|
| 316 |
+
"import chromadb\n",
|
| 317 |
+
"chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)\n",
|
| 318 |
+
"# chroma_client.delete_collection('RENTA_2023')\n",
|
| 319 |
+
"chroma_client.list_collections()"
|
| 320 |
+
]
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"cell_type": "code",
|
| 324 |
+
"execution_count": 28,
|
| 325 |
+
"metadata": {},
|
| 326 |
+
"outputs": [],
|
| 327 |
+
"source": [
|
| 328 |
+
"import re\n",
|
| 329 |
+
"import random\n",
|
| 330 |
+
"\n",
|
| 331 |
+
"def extract_page_number(text):\n",
|
| 332 |
+
" \"\"\"\n",
|
| 333 |
+
" Extrae el número de página del texto si tiene el formato:\n",
|
| 334 |
+
" <!-- PageNumber=\"Página 9\" -->\n",
|
| 335 |
+
" \"\"\"\n",
|
| 336 |
+
" patron = r\"<!-- PageNumber=\\\"Página (\\d+)\\\" -->\"\n",
|
| 337 |
+
" match = re.search(patron, text)\n",
|
| 338 |
+
" if match:\n",
|
| 339 |
+
" return str(match.group(1))\n",
|
| 340 |
+
" return None\n",
|
| 341 |
+
"\n",
|
| 342 |
+
"# Función para procesar un archivo Markdown usando MarkdownHeaderTextSplitter\n",
|
| 343 |
+
"def process_markdown_with_header_splitter(file_path):\n",
|
| 344 |
+
" \"\"\"\n",
|
| 345 |
+
" Procesa un archivo Markdown usando MarkdownHeaderTextSplitter.\n",
|
| 346 |
+
" \"\"\"\n",
|
| 347 |
+
" # Leer el archivo Markdown\n",
|
| 348 |
+
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
| 349 |
+
" markdown_text = file.read()\n",
|
| 350 |
+
"\n",
|
| 351 |
+
" # Definir los niveles de encabezado a extraer como metadatos\n",
|
| 352 |
+
" headers_to_split_on = [\n",
|
| 353 |
+
" (\"#\", \"Header 1\"),\n",
|
| 354 |
+
" (\"##\", \"Header 2\"),\n",
|
| 355 |
+
" (\"###\", \"Header 3\"),\n",
|
| 356 |
+
" (\"####\", \"Header 4\"),\n",
|
| 357 |
+
" (\"#####\", \"Header 5\"),\n",
|
| 358 |
+
" (\"######\", \"Header 6\")\n",
|
| 359 |
+
" ]\n",
|
| 360 |
+
"\n",
|
| 361 |
+
" # Inicializar el splitter\n",
|
| 362 |
+
" markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
| 363 |
+
"\n",
|
| 364 |
+
" # Dividir el contenido en chunks\n",
|
| 365 |
+
" chunks = markdown_splitter.split_text(markdown_text)\n",
|
| 366 |
+
"\n",
|
| 367 |
+
" chunk_size = 1000\n",
|
| 368 |
+
" chunk_overlap = 200\n",
|
| 369 |
+
" text_splitter = RecursiveCharacterTextSplitter(\n",
|
| 370 |
+
" chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
|
| 371 |
+
" )\n",
|
| 372 |
+
"\n",
|
| 373 |
+
" chunks = text_splitter.split_documents(chunks)\n",
|
| 374 |
+
"\n",
|
| 375 |
+
" # Procesar los chunks para añadir números de página y otros ajustes\n",
|
| 376 |
+
" processed_chunks = []\n",
|
| 377 |
+
" previous_page_number = None\n",
|
| 378 |
+
" for chunk in chunks:\n",
|
| 379 |
+
" # Extraer número de página\n",
|
| 380 |
+
" page_number = extract_page_number(chunk.page_content)\n",
|
| 381 |
+
" if page_number is None:\n",
|
| 382 |
+
" page_number = previous_page_number\n",
|
| 383 |
+
" else:\n",
|
| 384 |
+
" previous_page_number = page_number\n",
|
| 385 |
+
"\n",
|
| 386 |
+
" # Añadir chunk procesado\n",
|
| 387 |
+
" processed_chunks.append({\n",
|
| 388 |
+
" \"content\": chunk.page_content,\n",
|
| 389 |
+
" \"metadata\": {**chunk.metadata, \"page\": page_number} if chunk.metadata else {\"page\": page_number}\n",
|
| 390 |
+
" })\n",
|
| 391 |
+
"\n",
|
| 392 |
+
" return processed_chunks\n",
|
| 393 |
+
"\n",
|
| 394 |
+
"# Función para probar una muestra aleatoria de chunks\n",
|
| 395 |
+
"def test_random_chunks(chunks, num_samples=5):\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" Imprime una muestra aleatoria de chunks para verificar el parseo.\n",
|
| 398 |
+
" \"\"\"\n",
|
| 399 |
+
" if not chunks:\n",
|
| 400 |
+
" print(\"No hay chunks disponibles para probar.\")\n",
|
| 401 |
+
" return\n",
|
| 402 |
+
"\n",
|
| 403 |
+
" print(f\"Mostrando {num_samples} chunks aleatorios para prueba:\")\n",
|
| 404 |
+
" sampled_chunks = random.sample(chunks, min(num_samples, len(chunks)))\n",
|
| 405 |
+
" for i, chunk in enumerate(sampled_chunks, start=1):\n",
|
| 406 |
+
" print(f\"\\n--- Chunk {i} ---\")\n",
|
| 407 |
+
" print(\"Contenido:\")\n",
|
| 408 |
+
" print(chunk[\"content\"])\n",
|
| 409 |
+
" print(\"Metadatos:\")\n",
|
| 410 |
+
" print(chunk[\"metadata\"])\n",
|
| 411 |
+
" print(\"-\" * 40)\n",
|
| 412 |
+
"\n",
|
| 413 |
+
"# Función para añadir chunks a ChromaDB\n",
|
| 414 |
+
"def add_chunks_to_chroma(chunks):\n",
|
| 415 |
+
" \"\"\"\n",
|
| 416 |
+
" Añade los chunks y sus embeddings a la colección de ChromaDB.\n",
|
| 417 |
+
" \"\"\"\n",
|
| 418 |
+
" chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)\n",
|
| 419 |
+
" collection = chroma_client.get_or_create_collection(\n",
|
| 420 |
+
" name=\"RENTA_2023_2\",\n",
|
| 421 |
+
" embedding_function=openai_embedding,\n",
|
| 422 |
+
" )\n",
|
| 423 |
+
"\n",
|
| 424 |
+
" print(f\"Chunks actuales en la colección: {collection.count()}\")\n",
|
| 425 |
+
"\n",
|
| 426 |
+
" if collection.count() == 0:\n",
|
| 427 |
+
" # Generar datos en un solo bucle para mejorar la eficiencia\n",
|
| 428 |
+
" data = [\n",
|
| 429 |
+
" (\n",
|
| 430 |
+
" str(uuid4()), # ID único\n",
|
| 431 |
+
" openai_embedding([chunk[\"content\"]])[0], # Embedding\n",
|
| 432 |
+
" chunk[\"metadata\"], # Metadatos\n",
|
| 433 |
+
" chunk[\"content\"] # Contenido del documento\n",
|
| 434 |
+
" )\n",
|
| 435 |
+
" for chunk in chunks\n",
|
| 436 |
+
" ]\n",
|
| 437 |
+
"\n",
|
| 438 |
+
" # Desempaquetar los datos generados\n",
|
| 439 |
+
" docs_ids, docs_embeddings, docs_metadata, docs = zip(*data)\n",
|
| 440 |
+
"\n",
|
| 441 |
+
" # Crear lotes para la inserción\n",
|
| 442 |
+
" batches = create_batches(\n",
|
| 443 |
+
" api=chroma_client,\n",
|
| 444 |
+
" ids=list(docs_ids),\n",
|
| 445 |
+
" documents=list(docs),\n",
|
| 446 |
+
" embeddings=list(docs_embeddings),\n",
|
| 447 |
+
" metadatas=list(docs_metadata)\n",
|
| 448 |
+
" )\n",
|
| 449 |
+
" # Insertar los datos en Chroma DB\n",
|
| 450 |
+
" for batch in batches:\n",
|
| 451 |
+
" collection.add(\n",
|
| 452 |
+
" documents=batch[3],\n",
|
| 453 |
+
" ids=batch[0],\n",
|
| 454 |
+
" metadatas=batch[2],\n",
|
| 455 |
+
" embeddings=batch[1],\n",
|
| 456 |
+
" )\n",
|
| 457 |
+
" print(f\"Guardados {len(batch[3])}/{collection.count()} chunks en {CHROMA_PATH}.\")"
|
| 458 |
+
]
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"cell_type": "code",
|
| 462 |
+
"execution_count": 30,
|
| 463 |
+
"metadata": {},
|
| 464 |
+
"outputs": [
|
| 465 |
+
{
|
| 466 |
+
"name": "stdout",
|
| 467 |
+
"output_type": "stream",
|
| 468 |
+
"text": [
|
| 469 |
+
"Total chunks extraídos: 1478\n",
|
| 470 |
+
"Mostrando 5 chunks aleatorios para prueba:\n",
|
| 471 |
+
"\n",
|
| 472 |
+
"--- Chunk 1 ---\n",
|
| 473 |
+
"Contenido:\n",
|
| 474 |
+
"turísticos, que puedan llevar a cabo las entidades estatales, autonómicas o locales con\n",
|
| 475 |
+
"competencias en materia de cultura, turismo y economía, así como por las Film Commissions\n",
|
| 476 |
+
"o Film Offices que hayan intervenido en la realización del rodaje o producción. \n",
|
| 477 |
+
"Atención: los requisitos establecidos en las letras b') y c') no son exigibles en el caso de\n",
|
| 478 |
+
"producciones extranjeras de largometrajes cinematográficos y obras audiovisuales en\n",
|
| 479 |
+
"las que el contrato por el que se encargó la ejecución de la producción se hubiese\n",
|
| 480 |
+
"firmado con anterioridad al 11 de julio de 2021, por aplicación de lo dispuesto en la\n",
|
| 481 |
+
"disposición transitoria cuadragésima segunda de la LIS. \n",
|
| 482 |
+
"Límites \n",
|
| 483 |
+
"Esta deducción queda excluida del límite conjunto (25/50 por 100) previsto el artículo\n",
|
| 484 |
+
"39.1 LIS para las deducciones para incentivar la realización de determinadas actividades y,\n",
|
| 485 |
+
"por tanto, esta deducción por gastos de ejecución de una producción extranjera no se\n",
|
| 486 |
+
"computa para el cálculo de dicho límite. \n",
|
| 487 |
+
"Finalmente, téngase en cuenta que, en caso de insuficiencia de cuota en la aplicación de\n",
|
| 488 |
+
"esta deducción del artículo 36.2 de la LIS, la posibilidad de poder solicitar su abono a la\n",
|
| 489 |
+
"Administración que se concede en el artículo 39.3 de la LIS, no es aplicable al IRPF, de\n",
|
| 490 |
+
"acuerdo con lo dispuesto en el artículo 68.2 de la Ley del IRPF. \n",
|
| 491 |
+
"<!-- PageNumber=\"Página 1220\" -->\n",
|
| 492 |
+
"<!-- PageBreak --> \n",
|
| 493 |
+
"<!-- PageHeader=\"Deducciones por incentivos y estímulos a la inversión empresarial en actividades ecc\" -->\n",
|
| 494 |
+
"Metadatos:\n",
|
| 495 |
+
"{'Header 1': 'Capítulo 16. Deducciones generales de la cuota en el ejercicio 2023', 'Header 2': 'Deducciones por incentivos y estímulos a la inversión empresarial en actividades económicas en estimación directa', 'Header 3': 'Régimen general y regímenes especiales de deducciones por incentivos y estímulos a la inversión empresarial de la Ley del Impuesto sobre Sociedades', 'Header 4': '2. Régimen general de deducciones', 'Header 5': 'Artículo 36 LIS \"Deducción por inversiones en producciones cinematográficas, series audiovisuales y espectáculos en vivo de artes escénicas y musicales\"', 'Header 6': 'B. Deducción por producciones cinematográficas extranjeras en España (art. 36.2 LIS)', 'page': '1220'}\n",
|
| 496 |
+
"----------------------------------------\n",
|
| 497 |
+
"\n",
|
| 498 |
+
"--- Chunk 2 ---\n",
|
| 499 |
+
"Contenido:\n",
|
| 500 |
+
"A. Delimitación positiva \n",
|
| 501 |
+
"Normativa: Art. 21 Ley IRPF \n",
|
| 502 |
+
"Tienen la consideración fiscal de rendimientos del capital mobiliario todas las utilidades\n",
|
| 503 |
+
"o contraprestaciones, cualquiera que sea su denominación o naturaleza, dinerarias o en\n",
|
| 504 |
+
"especie, que provengan, directa o indirectamente, del capital mobiliario y, en general, de\n",
|
| 505 |
+
"bienes o derechos no clasificados como inmobiliarios, de los que sea titular el contribuyente y\n",
|
| 506 |
+
"no se hallen afectos a actividades económicas realizadas por él mismo. \n",
|
| 507 |
+
"Los rendimientos correspondientes a los elementos patrimoniales, bienes o derechos, que se\n",
|
| 508 |
+
"hallen afectos de manera exclusiva a actividades económicas realizadas por el contribuyente\n",
|
| 509 |
+
"se comprenderán entre los procedentes de las indicadas actividades. \n",
|
| 510 |
+
"Importante: en ningún caso tienen la consideración de elementos patrimoniales afectos\n",
|
| 511 |
+
"a actividades económicas, los activos representativos de la participación en fondos\n",
|
| 512 |
+
"propios de una entidad y de la cesión de capitales a terceros [Art. 29.1 c) Ley IRPF]. \n",
|
| 513 |
+
"B. Delimitación negativa \n",
|
| 514 |
+
"<!-- PageNumber=\"Página 298\" -->\n",
|
| 515 |
+
"<!-- PageBreak --> \n",
|
| 516 |
+
"<!-- PageHeader=\"Rendimientos del capital mobiliario: cuestiones generales\" --> \n",
|
| 517 |
+
"No se consideran rendimientos del capital mobiliario, entre otros: \n",
|
| 518 |
+
"· Los derivados de la entrega de acciones liberadas. \n",
|
| 519 |
+
"Normativa: Arts. 25.1 b) y 37.1 a) y b) Ley IRPF \n",
|
| 520 |
+
"El tratamiento fiscal de la recepción de acciones liberadas tanto en el caso de valores\n",
|
| 521 |
+
"admitidos a negociación como de valores no admitidos a negociación se comenta en el\n",
|
| 522 |
+
"Capítulo 11. \n",
|
| 523 |
+
"Atención: desde el 1 de enero de 2017, el importe obtenido por la transmisión tanto\n",
|
| 524 |
+
"de derechos de suscripción procedentes de valores admitidos a negociación como\n",
|
| 525 |
+
"de derechos de suscripción procedentes de valores no admitidos a negociación en\n",
|
| 526 |
+
"un mercado organizado tiene la consideración de ganancia patrimonial sujeta a\n",
|
| 527 |
+
"retención. Véase el Capítulo 11. \n",
|
| 528 |
+
"· Los dividendos y participaciones en beneficios distribuidos por sociedades que\n",
|
| 529 |
+
"procedan de períodos impositivos durante los cuales dichas sociedades se hallasen\n",
|
| 530 |
+
"en régimen de transparencia fiscal. \n",
|
| 531 |
+
"Normativa: Art. 91.9 y disposición transitoria décima Ley IRPF; disposición\n",
|
| 532 |
+
"transitoria cuarta Reglamento IRPF \n",
|
| 533 |
+
"· La contraprestación obtenida por el aplazamiento o el fraccionamiento del precio de\n",
|
| 534 |
+
"las operaciones realizadas en el desarrollo de una actividad económica habitual del\n",
|
| 535 |
+
"contribuyente. \n",
|
| 536 |
+
"Normativa: Art. 25.5 Ley IRPF \n",
|
| 537 |
+
"· Los derivados de las transmisiones lucrativas, por causa de muerte del contribuyente,\n",
|
| 538 |
+
"de los activos representativos de la captación y utilización de capitales ajenos. Tampoco\n",
|
| 539 |
+
"se computará el rendimiento del capital mobiliario negativo derivado de la transmisión\n",
|
| 540 |
+
"lucrativa de activos representativos de la captación y utilización de capitales ajenos\n",
|
| 541 |
+
"por actos \"inter vivos\". \n",
|
| 542 |
+
"Normativa: Art. 25.6 Ley IRPF \n",
|
| 543 |
+
"· Los dividendos y participaciones en beneficios a que se refiere el artículo 25.1 a) y b)\n",
|
| 544 |
+
"Ley IRPF que procedan de beneficios obtenidos en períodos impositivos durante los\n",
|
| 545 |
+
"cuales la entidad que los distribuye hubiera tributado en el régimen de las\n",
|
| 546 |
+
"sociedades patrimoniales \n",
|
| 547 |
+
"Normativa: Disposición transitoria decima Ley IRPF \n",
|
| 548 |
+
"· La distribución de los beneficios a que se refiere el artículo 25.1 a) y b) Ley IRPF\n",
|
| 549 |
+
"obtenidos durante los períodos impositivos en los que hubiera sido de aplicación el\n",
|
| 550 |
+
"régimen de atribución de rentas por sociedades civiles que hubieran llevado contabilidad\n",
|
| 551 |
+
"ajustada al código de comercio en los ejercicios 2014 y 2015 y que pasaron a tener la \n",
|
| 552 |
+
"<!-- PageNumber=\"Página 299\" -->\n",
|
| 553 |
+
"<!-- PageBreak --> \n",
|
| 554 |
+
"<!-- PageHeader=\"Capítulo 5. Rendimientos del capital mobiliario\" --> \n",
|
| 555 |
+
"consideración de contribuyentes del Impuesto sobre Sociedades a partir de 1 de enero de\n",
|
| 556 |
+
"2016, no se integrarán en la base imponible del perceptor que sea contribuyente del\n",
|
| 557 |
+
"IRPF, ni estarán sujetos a retención e ingreso a cuenta. \n",
|
| 558 |
+
"Normativa: Disposición transitoria trigésima segunda.3 LIS\n",
|
| 559 |
+
"Metadatos:\n",
|
| 560 |
+
"{'Header 1': 'Capítulo 5. Rendimientos del capital mobiliario', 'Header 2': 'Rendimientos del capital mobiliario: cuestiones generales', 'Header 3': 'Concepto', 'Header 4': 'Delimitación positiva y negativa', 'page': '298'}\n",
|
| 561 |
+
"----------------------------------------\n",
|
| 562 |
+
"\n",
|
| 563 |
+
"--- Chunk 3 ---\n",
|
| 564 |
+
"Contenido:\n",
|
| 565 |
+
"Provincia: Valencia \n",
|
| 566 |
+
"<table>\n",
|
| 567 |
+
"<tr>\n",
|
| 568 |
+
"<th>Ámbito territorial</th>\n",
|
| 569 |
+
"<th>Actividad</th>\n",
|
| 570 |
+
"<th>Índice rendimiento neto</th>\n",
|
| 571 |
+
"</tr>\n",
|
| 572 |
+
"<tr>\n",
|
| 573 |
+
"<td rowspan=\"4\">Todos los términos municipales (excepto los que se detallen con un índice inferior).</td>\n",
|
| 574 |
+
"<td>Cereales: Avena.</td>\n",
|
| 575 |
+
"<td>0,05</td>\n",
|
| 576 |
+
"</tr>\n",
|
| 577 |
+
"<tr>\n",
|
| 578 |
+
"<td>Cereales: Cebada.</td>\n",
|
| 579 |
+
"<td>0,05</td>\n",
|
| 580 |
+
"</tr>\n",
|
| 581 |
+
"<tr>\n",
|
| 582 |
+
"<td>Cereales: Centeno.</td>\n",
|
| 583 |
+
"<td>0,05</td>\n",
|
| 584 |
+
"</tr>\n",
|
| 585 |
+
"<tr>\n",
|
| 586 |
+
"<td>Cereales: Trigo.</td>\n",
|
| 587 |
+
"<td>0,05</td>\n",
|
| 588 |
+
"</tr>\n",
|
| 589 |
+
"</table> \n",
|
| 590 |
+
"<!-- PageNumber=\"Página 814\" -->\n",
|
| 591 |
+
"<!-- PageBreak --> \n",
|
| 592 |
+
"<!-- PageHeader=\"Apéndice: Relación de productos naturales, servicios y actividades accesorios realiza\" --> \n",
|
| 593 |
+
"Provincia: Valencia \n",
|
| 594 |
+
"<table>\n",
|
| 595 |
+
"<tr>\n",
|
| 596 |
+
"<th>Ámbito territorial</th>\n",
|
| 597 |
+
"<th>Actividad</th>\n",
|
| 598 |
+
"<th>Índice rendimiento neto</th>\n",
|
| 599 |
+
"</tr>\n",
|
| 600 |
+
"<tr>\n",
|
| 601 |
+
"<td rowspan=\"6\"></td>\n",
|
| 602 |
+
"<td>Forrajes.</td>\n",
|
| 603 |
+
"<td>0,26</td>\n",
|
| 604 |
+
"</tr>\n",
|
| 605 |
+
"<tr>\n",
|
| 606 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 607 |
+
"<td>0,19</td>\n",
|
| 608 |
+
"</tr>\n",
|
| 609 |
+
"<tr>\n",
|
| 610 |
+
"<td>Frutos no cítricos: Ciruela.</td>\n",
|
| 611 |
+
"<td>0,19</td>\n",
|
| 612 |
+
"</tr>\n",
|
| 613 |
+
"<tr>\n",
|
| 614 |
+
"<td>Leguminosas.</td>\n",
|
| 615 |
+
"<td>0,09</td>\n",
|
| 616 |
+
"</tr>\n",
|
| 617 |
+
"<tr>\n",
|
| 618 |
+
"<td>Uva para vino sin D.O.</td>\n",
|
| 619 |
+
"<td>0,13</td>\n",
|
| 620 |
+
"</tr>\n",
|
| 621 |
+
"<tr>\n",
|
| 622 |
+
"<td>Uva para vino con D.O.</td>\n",
|
| 623 |
+
"<td>0,16</td>\n",
|
| 624 |
+
"</tr>\n",
|
| 625 |
+
"<tr>\n",
|
| 626 |
+
"<td rowspan=\"3\">Términos municipales de: Ademuz.</td>\n",
|
| 627 |
+
"<td>Forrajes: Veza.</td>\n",
|
| 628 |
+
"<td>0,07</td>\n",
|
| 629 |
+
"</tr>\n",
|
| 630 |
+
"<tr>\n",
|
| 631 |
+
"<td>Frutos no cítricos: Manzana.</td>\n",
|
| 632 |
+
"<td>0,13</td>\n",
|
| 633 |
+
"</tr>\n",
|
| 634 |
+
"<tr>\n",
|
| 635 |
+
"<td>Frutos secos: Almendra.</td>\n",
|
| 636 |
+
"<td>0,05</td>\n",
|
| 637 |
+
"</tr>\n",
|
| 638 |
+
"<tr>\n",
|
| 639 |
+
"<td rowspan=\"4\">Términos municipales de: Ador y Xeraco.</td>\n",
|
| 640 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 641 |
+
"<td>0,18</td>\n",
|
| 642 |
+
"</tr>\n",
|
| 643 |
+
"<tr>\n",
|
| 644 |
+
"<td>Frutos no cítricos: Aguacate.</td>\n",
|
| 645 |
+
"<td>0,26</td>\n",
|
| 646 |
+
"</tr>\n",
|
| 647 |
+
"<tr>\n",
|
| 648 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 649 |
+
"<td>0,26</td>\n",
|
| 650 |
+
"</tr>\n",
|
| 651 |
+
"<tr>\n",
|
| 652 |
+
"<td>Otros productos agrícolas: Viveros.</td>\n",
|
| 653 |
+
"<td>0,26</td>\n",
|
| 654 |
+
"</tr>\n",
|
| 655 |
+
"<tr>\n",
|
| 656 |
+
"<td rowspan=\"2\">Términos municipales de: Atzeneta d'Albaida, Benissoda y Bufali.</td>\n",
|
| 657 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 658 |
+
"<td>0,07</td>\n",
|
| 659 |
+
"</tr>\n",
|
| 660 |
+
"<tr>\n",
|
| 661 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 662 |
+
"<td>0,19</td>\n",
|
| 663 |
+
"</tr>\n",
|
| 664 |
+
"<tr>\n",
|
| 665 |
+
"<td rowspan=\"2\">Términos municipales de: Agullent.</td>\n",
|
| 666 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 667 |
+
"<td>0,07</td>\n",
|
| 668 |
+
"</tr>\n",
|
| 669 |
+
"<tr>\n",
|
| 670 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 671 |
+
"<td>0,19</td>\n",
|
| 672 |
+
"</tr>\n",
|
| 673 |
+
"</table> \n",
|
| 674 |
+
"<!-- PageNumber=\"Página 815\" -->\n",
|
| 675 |
+
"<!-- PageBreak --> \n",
|
| 676 |
+
"<!-- PageHeader=\"Capítulo 9. Rendimientos de actividades económicas en estimación objetiva (II) (Acti\" --> \n",
|
| 677 |
+
"Provincia: Valencia \n",
|
| 678 |
+
"<table>\n",
|
| 679 |
+
"<tr>\n",
|
| 680 |
+
"<th>Ámbito territorial</th>\n",
|
| 681 |
+
"<th>Actividad</th>\n",
|
| 682 |
+
"<th>Índice rendimiento neto</th>\n",
|
| 683 |
+
"</tr>\n",
|
| 684 |
+
"<tr>\n",
|
| 685 |
+
"<td>Términos municipales de: Alaquàs, Bellús, Càrcer, Navarrés, Riba-roja de Túria y Sagunto/Sagunt.</td>\n",
|
| 686 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 687 |
+
"<td>0,26</td>\n",
|
| 688 |
+
"</tr>\n",
|
| 689 |
+
"<tr>\n",
|
| 690 |
+
"<td>Términos municipales de: Aielo de Malferit, Albaida, Alborache, Manises, Marines, Pobla Llarga (La) y Riola.</td>\n",
|
| 691 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 692 |
+
"<td>0,19</td>\n",
|
| 693 |
+
"</tr>\n",
|
| 694 |
+
"<tr>\n",
|
| 695 |
+
"<td rowspan=\"2\">Términos municipales de: Albal, Alberic, Almiserà, Bellreguard, Beneixida, Cotes, Guardamar de la Safor, Massanassa y Quart de les Valls.</td>\n",
|
| 696 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 697 |
+
"<td>0,18</td>\n",
|
| 698 |
+
"</tr>\n",
|
| 699 |
+
"<tr>\n",
|
| 700 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 701 |
+
"<td>0,26</td>\n",
|
| 702 |
+
"</tr>\n",
|
| 703 |
+
"<tr>\n",
|
| 704 |
+
"<td rowspan=\"2\">Términos municipales de: Albalat de la Ribera.</td>\n",
|
| 705 |
+
"<td>Cítricos: Mandarina.</td>\n",
|
| 706 |
+
"<td>0,13</td>\n",
|
| 707 |
+
"</tr>\n",
|
| 708 |
+
"<tr>\n",
|
| 709 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 710 |
+
"<td>0,26</td>\n",
|
| 711 |
+
"</tr>\n",
|
| 712 |
+
"<tr>\n",
|
| 713 |
+
"<td rowspan=\"4\">Términos municipales de: Albalat dels Sorells.</td>\n",
|
| 714 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 715 |
+
"<td>0,09</td>\n",
|
| 716 |
+
"</tr>\n",
|
| 717 |
+
"<tr>\n",
|
| 718 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 719 |
+
"<td>0,07</td>\n",
|
| 720 |
+
"</tr>\n",
|
| 721 |
+
"<tr>\n",
|
| 722 |
+
"<td>Productos hortícolas: Alcachofa.</td>\n",
|
| 723 |
+
"<td>0,18</td>\n",
|
| 724 |
+
"</tr>\n",
|
| 725 |
+
"<tr>\n",
|
| 726 |
+
"<td>Productos hortícolas: Cebolla.</td>\n",
|
| 727 |
+
"<td>0,18</td>\n",
|
| 728 |
+
"</tr>\n",
|
| 729 |
+
"<tr>\n",
|
| 730 |
+
"<td rowspan=\"3\">Términos municipales de: Alboraia/Alboraya.</td>\n",
|
| 731 |
+
"<td>Productos hortícolas: Alcachofa.</td>\n",
|
| 732 |
+
"<td>0,09</td>\n",
|
| 733 |
+
"</tr>\n",
|
| 734 |
+
"<tr>\n",
|
| 735 |
+
"<td>Productos hortícolas: Cebolla.</td>\n",
|
| 736 |
+
"<td>0,09</td>\n",
|
| 737 |
+
"</tr>\n",
|
| 738 |
+
"<tr>\n",
|
| 739 |
+
"<td>Productos hortícolas: Haba verde.</td>\n",
|
| 740 |
+
"<td>0,13</td>\n",
|
| 741 |
+
"</tr>\n",
|
| 742 |
+
"<tr>\n",
|
| 743 |
+
"<td rowspan=\"3\">Términos municipales de: Albuixech.</td>\n",
|
| 744 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 745 |
+
"<td>0,09</td>\n",
|
| 746 |
+
"</tr>\n",
|
| 747 |
+
"<tr>\n",
|
| 748 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 749 |
+
"<td>0,19</td>\n",
|
| 750 |
+
"</tr>\n",
|
| 751 |
+
"<tr>\n",
|
| 752 |
+
"<td>Productos hortícolas: Alcachofa.</td>\n",
|
| 753 |
+
"<td>0,13</td>\n",
|
| 754 |
+
"</tr>\n",
|
| 755 |
+
"<tr>\n",
|
| 756 |
+
"<td>Términos municipales de: Alcàsser.</td>\n",
|
| 757 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 758 |
+
"<td>0,18</td>\n",
|
| 759 |
+
"</tr>\n",
|
| 760 |
+
"</table> \n",
|
| 761 |
+
"<!-- PageNumber=\"Página 816\" -->\n",
|
| 762 |
+
"<!-- PageBreak --> \n",
|
| 763 |
+
"<!-- PageHeader=\"Apéndice: Relación de productos naturales, servicios y actividades accesorios realiza\" --> \n",
|
| 764 |
+
"Provincia: Valencia \n",
|
| 765 |
+
"<table>\n",
|
| 766 |
+
"<tr>\n",
|
| 767 |
+
"<th>Ámbito territorial</th>\n",
|
| 768 |
+
"<th>Actividad</th>\n",
|
| 769 |
+
"<th>Índice rendimiento neto</th>\n",
|
| 770 |
+
"</tr>\n",
|
| 771 |
+
"<tr>\n",
|
| 772 |
+
"<td rowspan=\"4\"></td>\n",
|
| 773 |
+
"<td>Frutos no cítricos: Aguacate.</td>\n",
|
| 774 |
+
"<td>0,26</td>\n",
|
| 775 |
+
"</tr>\n",
|
| 776 |
+
"<tr>\n",
|
| 777 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 778 |
+
"<td>0,26</td>\n",
|
| 779 |
+
"</tr>\n",
|
| 780 |
+
"<tr>\n",
|
| 781 |
+
"<td>Frutos no cítricos: Granado.</td>\n",
|
| 782 |
+
"<td>0,26</td>\n",
|
| 783 |
+
"</tr>\n",
|
| 784 |
+
"<tr>\n",
|
| 785 |
+
"<td>Frutos tropicales.</td>\n",
|
| 786 |
+
"<td>0,26</td>\n",
|
| 787 |
+
"</tr>\n",
|
| 788 |
+
"<tr>\n",
|
| 789 |
+
"<td rowspan=\"5\">Términos municipales de: Alzira.</td>\n",
|
| 790 |
+
"<td>Cítricos: Mandarina.</td>\n",
|
| 791 |
+
"<td>0,18</td>\n",
|
| 792 |
+
"</tr>\n",
|
| 793 |
+
"<tr>\n",
|
| 794 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 795 |
+
"<td>0,18</td>\n",
|
| 796 |
+
"</tr>\n",
|
| 797 |
+
"<tr>\n",
|
| 798 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 799 |
+
"<td>0,13</td>\n",
|
| 800 |
+
"</tr>\n",
|
| 801 |
+
"<tr>\n",
|
| 802 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 803 |
+
"<td>0,26</td>\n",
|
| 804 |
+
"</tr>\n",
|
| 805 |
+
"<tr>\n",
|
| 806 |
+
"<td>Frutos no cítricos: Melocotón y nectarina.</td>\n",
|
| 807 |
+
"<td>0,26</td>\n",
|
| 808 |
+
"</tr>\n",
|
| 809 |
+
"<tr>\n",
|
| 810 |
+
"<td>Términos municipales de: Alcublas, Domeño, Eliana (L\"), Gestalgar, Gátova y Vilamarxant.</td>\n",
|
| 811 |
+
"<td>Frutos secos: Algarroba.</td>\n",
|
| 812 |
+
"<td>0,18</td>\n",
|
| 813 |
+
"</tr>\n",
|
| 814 |
+
"<tr>\n",
|
| 815 |
+
"<td rowspan=\"3\">Términos municipales de: Alcúdia (L\") y Benimodo.</td>\n",
|
| 816 |
+
"<td>Cítricos: Mandarina.</td>\n",
|
| 817 |
+
"<td>0,18</td>\n",
|
| 818 |
+
"</tr>\n",
|
| 819 |
+
"<tr>\n",
|
| 820 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 821 |
+
"<td>0,18</td>\n",
|
| 822 |
+
"</tr>\n",
|
| 823 |
+
"<tr>\n",
|
| 824 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 825 |
+
"<td>0,19</td>\n",
|
| 826 |
+
"</tr>\n",
|
| 827 |
+
"<tr>\n",
|
| 828 |
+
"<td rowspan=\"3\">Términos municipales de: Alcúdia de Crespins (L\") y Llocnou d'En Fenollet.</td>\n",
|
| 829 |
+
"<td>Cítricos: Mandarina.</td>\n",
|
| 830 |
+
"<td>0,13</td>\n",
|
| 831 |
+
"</tr>\n",
|
| 832 |
+
"<tr>\n",
|
| 833 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 834 |
+
"<td>0,09</td>\n",
|
| 835 |
+
"</tr>\n",
|
| 836 |
+
"<tr>\n",
|
| 837 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 838 |
+
"<td>0,13</td>\n",
|
| 839 |
+
"</tr>\n",
|
| 840 |
+
"<tr>\n",
|
| 841 |
+
"<td>Términos municipales de: Aldaia.</td>\n",
|
| 842 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 843 |
+
"<td>0,07</td>\n",
|
| 844 |
+
"</tr>\n",
|
| 845 |
+
"</table> \n",
|
| 846 |
+
"<!-- PageNumber=\"Página 817\" -->\n",
|
| 847 |
+
"<!-- PageBreak --> \n",
|
| 848 |
+
"<!-- PageHeader=\"Capítulo 9. Rendimientos de actividades económicas en estimación objetiva (II) (Acti\" --> \n",
|
| 849 |
+
"Provincia: Valencia \n",
|
| 850 |
+
"<table>\n",
|
| 851 |
+
"<tr>\n",
|
| 852 |
+
"<th>Ámbito territorial</th>\n",
|
| 853 |
+
"<th>Actividad</th>\n",
|
| 854 |
+
"<th>Índice rendimiento neto</th>\n",
|
| 855 |
+
"</tr>\n",
|
| 856 |
+
"<tr>\n",
|
| 857 |
+
"<td></td>\n",
|
| 858 |
+
"<td>Frutos no cítricos: Granado.</td>\n",
|
| 859 |
+
"<td>0,26</td>\n",
|
| 860 |
+
"</tr>\n",
|
| 861 |
+
"<tr>\n",
|
| 862 |
+
"<td>Términos municipales de: Alfafar, Alqueria de la Comtessa (L\"), Beniarjó, Beniflá, Benirredrà, Daimús y Guadassuar.</td>\n",
|
| 863 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 864 |
+
"<td>0,18</td>\n",
|
| 865 |
+
"</tr>\n",
|
| 866 |
+
"<tr>\n",
|
| 867 |
+
"<td rowspan=\"5\">Términos municipales de: Alfauir.</td>\n",
|
| 868 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 869 |
+
"<td>0,18</td>\n",
|
| 870 |
+
"</tr>\n",
|
| 871 |
+
"<tr>\n",
|
| 872 |
+
"<td>Cítricos: Pomelo.</td>\n",
|
| 873 |
+
"<td>0,18</td>\n",
|
| 874 |
+
"</tr>\n",
|
| 875 |
+
"<tr>\n",
|
| 876 |
+
"<td>Frutos no cítricos: Aguacate.</td>\n",
|
| 877 |
+
"<td>0,26</td>\n",
|
| 878 |
+
"</tr>\n",
|
| 879 |
+
"<tr>\n",
|
| 880 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 881 |
+
"<td>0,19</td>\n",
|
| 882 |
+
"</tr>\n",
|
| 883 |
+
"<tr>\n",
|
| 884 |
+
"<td>Frutos secos: Algarroba.</td>\n",
|
| 885 |
+
"<td>0,13</td>\n",
|
| 886 |
+
"</tr>\n",
|
| 887 |
+
"<tr>\n",
|
| 888 |
+
"<td rowspan=\"3\">Términos municipales de: Alfara del Patriarca.</td>\n",
|
| 889 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 890 |
+
"<td>0,09</td>\n",
|
| 891 |
+
"</tr>\n",
|
| 892 |
+
"<tr>\n",
|
| 893 |
+
"<td>Productos hortícolas: Alcachofa.</td>\n",
|
| 894 |
+
"<td>0,09</td>\n",
|
| 895 |
+
"</tr>\n",
|
| 896 |
+
"<tr>\n",
|
| 897 |
+
"<td>Productos hortícolas: Haba verde.</td>\n",
|
| 898 |
+
"<td>0,09</td>\n",
|
| 899 |
+
"</tr>\n",
|
| 900 |
+
"<tr>\n",
|
| 901 |
+
"<td rowspan=\"3\">Términos municipales de: Alfarp.</td>\n",
|
| 902 |
+
"<td>Cítricos: Mandarina.</td>\n",
|
| 903 |
+
"<td>0,13</td>\n",
|
| 904 |
+
"</tr>\n",
|
| 905 |
+
"<tr>\n",
|
| 906 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 907 |
+
"<td>0,18</td>\n",
|
| 908 |
+
"</tr>\n",
|
| 909 |
+
"<tr>\n",
|
| 910 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 911 |
+
"<td>0,13</td>\n",
|
| 912 |
+
"</tr>\n",
|
| 913 |
+
"<tr>\n",
|
| 914 |
+
"<td rowspan=\"4\">Términos municipales de: Aielo de Rugat, Alfarrasí y Terrateig.</td>\n",
|
| 915 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 916 |
+
"<td>0,13</td>\n",
|
| 917 |
+
"</tr>\n",
|
| 918 |
+
"<tr>\n",
|
| 919 |
+
"<td>Frutos no cítricos: Caqui.</td>\n",
|
| 920 |
+
"<td>0,19</td>\n",
|
| 921 |
+
"</tr>\n",
|
| 922 |
+
"<tr>\n",
|
| 923 |
+
"<td>Frutos no cítricos: Ciruela.</td>\n",
|
| 924 |
+
"<td>0,07</td>\n",
|
| 925 |
+
"</tr>\n",
|
| 926 |
+
"<tr>\n",
|
| 927 |
+
"<td>Frutos no cítricos: Melocotón y nectarina.</td>\n",
|
| 928 |
+
"<td>0,19</td>\n",
|
| 929 |
+
"</tr>\n",
|
| 930 |
+
"</table> \n",
|
| 931 |
+
"<!-- PageNumber=\"Página 818\" -->\n",
|
| 932 |
+
"<!-- PageBreak --> \n",
|
| 933 |
+
"<!-- PageHeader=\"Apéndice: Relación de productos naturales, servicios y actividades accesorios realiza\" --> \n",
|
| 934 |
+
"Provincia: Valencia \n",
|
| 935 |
+
"<table>\n",
|
| 936 |
+
"<tr>\n",
|
| 937 |
+
"<th>Ámbito territorial</th>\n",
|
| 938 |
+
"<th>Actividad</th>\n",
|
| 939 |
+
"<th>Índice rendimiento neto</th>\n",
|
| 940 |
+
"</tr>\n",
|
| 941 |
+
"<tr>\n",
|
| 942 |
+
"<td>Términos municipales de: Algemesí y Montserrat.</td>\n",
|
| 943 |
+
"<td>Frutos no cítricos: Albaricoque.</td>\n",
|
| 944 |
+
"<td>0,13</td>\n",
|
| 945 |
+
"</tr>\n",
|
| 946 |
+
"<tr>\n",
|
| 947 |
+
"<td rowspan=\"4\">Términos municipales de: Almàssera.</td>\n",
|
| 948 |
+
"<td>Cítricos: Naranja.</td>\n",
|
| 949 |
+
"<td>0,09</td>\n",
|
| 950 |
+
"</tr>\n",
|
| 951 |
+
"<tr>\n",
|
| 952 |
+
"Metadatos:\n",
|
| 953 |
+
"{'Header 1': 'Capítulo 9. Rendimientos de actividades económicas en estimación objetiva (II) (Actividades agrícolas, ganaderas y forestales)', 'Header 2': 'Apéndice: Relación de productos naturales, servicios y actividades accesorios realizados por agricultores, ganaderos y titulares de actividades forestales e índices de rendimiento aplicables en el ejercicio 2023', 'Header 3': 'Anexo: Reducción de los índices de rendimiento neto, aplicables en 2023, por las actividades agrícolas y ganaderas afectadas por circunstancias excepcionales agrupados por Comunidades Autónomas, Provincias, ámbitos territoriales y actividades', 'Header 4': 'Comunitat Valenciana', 'Header 5': 'Valencia', 'page': '814'}\n",
|
| 954 |
+
"----------------------------------------\n",
|
| 955 |
+
"\n",
|
| 956 |
+
"--- Chunk 4 ---\n",
|
| 957 |
+
"Contenido:\n",
|
| 958 |
+
"<!-- PageNumber=\"Página 554\" -->\n",
|
| 959 |
+
"<!-- PageBreak --> \n",
|
| 960 |
+
"<!-- PageHeader=\"Apéndice: Rendimientos anuales por unidad de módulo antes de amortización aplica\" --> \n",
|
| 961 |
+
"<table>\n",
|
| 962 |
+
"<tr>\n",
|
| 963 |
+
"<th>Módulo</th>\n",
|
| 964 |
+
"<th>Definición</th>\n",
|
| 965 |
+
"<th>Unidad</th>\n",
|
| 966 |
+
"<th>Rendimiento anual por unidad (euros)</th>\n",
|
| 967 |
+
"</tr>\n",
|
| 968 |
+
"<tr>\n",
|
| 969 |
+
"<td>1</td>\n",
|
| 970 |
+
"<td>Personal asalariado</td>\n",
|
| 971 |
+
"<td>Persona</td>\n",
|
| 972 |
+
"<td>3.476,83</td>\n",
|
| 973 |
+
"</tr>\n",
|
| 974 |
+
"<tr>\n",
|
| 975 |
+
"<td>2</td>\n",
|
| 976 |
+
"<td>Personal no asalariado</td>\n",
|
| 977 |
+
"<td>Persona</td>\n",
|
| 978 |
+
"<td>17.220,39</td>\n",
|
| 979 |
+
"</tr>\n",
|
| 980 |
+
"<tr>\n",
|
| 981 |
+
"<td>3</td>\n",
|
| 982 |
+
"<td>Consumo de energía eléctrica</td>\n",
|
| 983 |
+
"<td>100 kWh</td>\n",
|
| 984 |
+
"<td>403,11</td>\n",
|
| 985 |
+
"</tr>\n",
|
| 986 |
+
"<tr>\n",
|
| 987 |
+
"<td>4</td>\n",
|
| 988 |
+
"<td>Superficie del local</td>\n",
|
| 989 |
+
"<td>m2</td>\n",
|
| 990 |
+
"<td>844,02</td>\n",
|
| 991 |
+
"</tr>\n",
|
| 992 |
+
"</table> \n",
|
| 993 |
+
"Nota: El rendimiento neto resultante de la aplicación de los signos o módulos anteriores incluye, en su caso, el\n",
|
| 994 |
+
"derivado de la venta de artículos de escaso valor tales como dulces, artículos de fumador, etc., los servicios de\n",
|
| 995 |
+
"publicidad exterior y comercialización de tarjetas de transporte público, tarjetas para uso telefónico y otras\n",
|
| 996 |
+
"similares, así como loterías, siempre que estas actividades se desarrollen con carácter accesorio a la actividad\n",
|
| 997 |
+
"principal. \n",
|
| 998 |
+
"Departamento de Gestión Tributaria \n",
|
| 999 |
+
"Cuantía a efectos del índice corrector de exceso: 28.860,22 euros\n",
|
| 1000 |
+
"Metadatos:\n",
|
| 1001 |
+
"{'Header 1': 'Capítulo 8. Rendimientos de actividades económicas en estimación objetiva (I) (Actividades distintas de las agrícolas, ganaderas y forestales)', 'Header 2': 'Apéndice: Rendimientos anuales por unidad de módulo antes de amortización aplicables en el ejercicio 2023', 'Header 3': 'Módulos aplicables a cada una de las actividades, incluidos en el Anexo II de la Orden HFP/1172/2022, con indicación de su correspondiente epígrafe en el Impuesto de Actividades Económicas (IAE)', 'Header 4': 'Epígrafe IAE: 659.4 - Comercio al por menor de prensa, revistas y libros en quioscos situados en la vía pública', 'page': '554'}\n",
|
| 1002 |
+
"----------------------------------------\n",
|
| 1003 |
+
"\n",
|
| 1004 |
+
"--- Chunk 5 ---\n",
|
| 1005 |
+
"Contenido:\n",
|
| 1006 |
+
"La participación de las Comunidades Autónomas en la Agencia Estatal de Administración\n",
|
| 1007 |
+
"Tributaria, que constituye la organización administrativa responsable en nombre y por cuenta\n",
|
| 1008 |
+
"del Estado de la aplicación efectiva del sistema tributario estatal y del aduanero, se desarrolla\n",
|
| 1009 |
+
"a través de los siguientes órganos, regulados en los artículos 65 y 66 de la Ley 22/2009: \n",
|
| 1010 |
+
"Consejo Superior para la Dirección y Coordinación de la Gestión Tributaria \n",
|
| 1011 |
+
"El Consejo Superior para la Dirección y Coordinación de la Gestión Tributaria es el órgano\n",
|
| 1012 |
+
"colegiado, integrado por representantes de la Administración Tributaria del Estado y de las\n",
|
| 1013 |
+
"Comunidades Autónomas y Ciudades con Estatuto de Autonomía, encargado de coordinar la\n",
|
| 1014 |
+
"gestión de los tributos cedidos. \n",
|
| 1015 |
+
"Este órgano está presidido por el Presidente de la Agencia Estatal de Administración\n",
|
| 1016 |
+
"Tributaria e integrado por la Directora General de la Agencia Estatal de Administración\n",
|
| 1017 |
+
"Tributaria, que ostentará la Vicepresidencia primera, cinco representantes de la Agencia\n",
|
| 1018 |
+
"Estatal de Administración Tributaria, los titulares de la Secretaría General de Hacienda, de la\n",
|
| 1019 |
+
"Secretaría General de Financiación Territorial y de la Inspección General del Ministerio de\n",
|
| 1020 |
+
"Economía y Hacienda (actualmente, Ministerio de Hacienda) y por un representante de cada\n",
|
| 1021 |
+
"una de las Comunidades Autónomas de régimen común y de las Ciudades con Estatuto de\n",
|
| 1022 |
+
"Autonomía, uno de los cuales será designado por éstas cada año para ostentar la\n",
|
| 1023 |
+
"Vicepresidencia segunda. \n",
|
| 1024 |
+
"Aquellas Comunidades y Ciudades Autónomas que tengan encomendadas a dos órganos o\n",
|
| 1025 |
+
"entes distintos las funciones de aplicación de los tributos y las de diseño o interpretación de\n",
|
| 1026 |
+
"la normativa autonómica podrán designar dos representantes, si bien dispondrán de un solo\n",
|
| 1027 |
+
"voto. \n",
|
| 1028 |
+
"Precisión: téngase en cuenta que el artículo 1.8 del Real Decreto 352/2011, de 11 de marzo (BOE de 12 de\n",
|
| 1029 |
+
"marzo) suprimió la Secretaría General de Financiación Territorial y que el Real Decreto 1887/2011, de 30 de\n",
|
| 1030 |
+
"diciembre (BOE de 31 de diciembre), suprimió en su artículo 4.3, entre otros órganos directivos, la Secretaría\n",
|
| 1031 |
+
"General de Hacienda y la Inspección General del Ministerio de Economía y Hacienda. \n",
|
| 1032 |
+
"En la actualidad véase el Real Decreto 139/2020, de 28 de enero, por el que se establece la estructura\n",
|
| 1033 |
+
"orgánica básica de los departamentos ministeriales (BOE de 29 de enero), que incluye como órgano directivo\n",
|
| 1034 |
+
"de la Secretaría de Estado de Hacienda de la que depende la Secretaría General de Financiación Autonómica\n",
|
| 1035 |
+
"y Local y, dentro de la actual Subsecretaría de Hacienda y Función Pública, a la Inspección General. \n",
|
| 1036 |
+
"Consejos Territoriales para la Dirección y Coordinación de la Gestión Tributaria \n",
|
| 1037 |
+
"<!-- PageNumber=\"Página 95\" -->\n",
|
| 1038 |
+
"<!-- PageBreak --> \n",
|
| 1039 |
+
"<!-- PageHeader=\"Capítulo 2. El Impuesto sobre la Renta de las Personas Físicas (IRPF): cuestiones g\" --> \n",
|
| 1040 |
+
"Los Consejos Territoriales para la Dirección y Coordinación de la Gestión Tributaria son\n",
|
| 1041 |
+
"órganos colegiados integrados por representantes de la Administración Tributaria del Estado\n",
|
| 1042 |
+
"y de la Comunidad Autónoma o de la Ciudad con Estatuto de Autonomía de que se trate a los\n",
|
| 1043 |
+
"que corresponde coordinar la gestión de los tributos cedidos en su respectivo ámbito\n",
|
| 1044 |
+
"territorial. \n",
|
| 1045 |
+
"Estos consejos están compuestos por cuatro representantes de la Agencia Estatal de\n",
|
| 1046 |
+
"Administración Tributaria y cuatro de la respectiva Comunidad Autónoma o Ciudad con\n",
|
| 1047 |
+
"Estatuto de Autonomía. Existirán tantos suplentes como titulares, que actuarán en caso de\n",
|
| 1048 |
+
"ausencia o vacante de alguno de estos últimos.\n",
|
| 1049 |
+
"Metadatos:\n",
|
| 1050 |
+
"{'Header 1': 'Capítulo 2. El Impuesto sobre la Renta de las Personas Físicas (IRPF): cuestiones generales', 'Header 2': 'Cesión parcial del IRPF a las Comunidades Autónomas', 'Header 3': 'Participación de las Comunidades Autónomas y Ciudades con Estatuto de Autonomía en la gestión del IRPF', 'page': '95'}\n",
|
| 1051 |
+
"----------------------------------------\n"
|
| 1052 |
+
]
|
| 1053 |
+
}
|
| 1054 |
+
],
|
| 1055 |
+
"source": [
|
| 1056 |
+
"import chromadb\n",
|
| 1057 |
+
"from chromadb.utils.batch_utils import create_batches\n",
|
| 1058 |
+
"\n",
|
| 1059 |
+
"MARKDOWN_FILE_PATH = \"/teamspace/studios/this_studio/AgenticRAG/doc_renta/Renta_2023_doc_int_corrected_2.md\"\n",
|
| 1060 |
+
"\n",
|
| 1061 |
+
"# Procesar el archivo Markdown con MarkdownHeaderTextSplitter\n",
|
| 1062 |
+
"chunks = process_markdown_with_header_splitter(MARKDOWN_FILE_PATH)\n",
|
| 1063 |
+
"print(f\"Total chunks extraídos: {len(chunks)}\")\n",
|
| 1064 |
+
"\n",
|
| 1065 |
+
"# Realizar prueba aleatoria\n",
|
| 1066 |
+
"test_random_chunks(chunks, num_samples=5)\n",
|
| 1067 |
+
"\n",
|
| 1068 |
+
"# Añadir los chunks a ChromaDB (comentado para evitar ejecución accidental)\n",
|
| 1069 |
+
"# add_chunks_to_chroma(chunks)"
|
| 1070 |
+
]
|
| 1071 |
+
},
|
| 1072 |
+
{
|
| 1073 |
+
"cell_type": "code",
|
| 1074 |
+
"execution_count": null,
|
| 1075 |
+
"metadata": {},
|
| 1076 |
+
"outputs": [],
|
| 1077 |
+
"source": [
|
| 1078 |
+
"# results = collection.query(\n",
|
| 1079 |
+
"# query_texts=[\n",
|
| 1080 |
+
"# \"What is the embedding used in llama model?\"\n",
|
| 1081 |
+
"# ], # Also available query_embeddings to use embeddings instead of text.\n",
|
| 1082 |
+
"# n_results=2, # how many results to return\n",
|
| 1083 |
+
"# )\n",
|
| 1084 |
+
"\n",
|
| 1085 |
+
"# print(results)"
|
| 1086 |
+
]
|
| 1087 |
+
},
|
| 1088 |
+
{
|
| 1089 |
+
"cell_type": "markdown",
|
| 1090 |
+
"metadata": {},
|
| 1091 |
+
"source": [
|
| 1092 |
+
"### Code to save"
|
| 1093 |
+
]
|
| 1094 |
+
},
|
| 1095 |
+
{
|
| 1096 |
+
"cell_type": "code",
|
| 1097 |
+
"execution_count": null,
|
| 1098 |
+
"metadata": {},
|
| 1099 |
+
"outputs": [],
|
| 1100 |
+
"source": [
|
| 1101 |
+
"# from litellm import embedding\n",
|
| 1102 |
+
"\n",
|
| 1103 |
+
"# response = embedding(\n",
|
| 1104 |
+
"# model=EMBEDDING_NAME,\n",
|
| 1105 |
+
"# input=[\"test embedding\"],\n",
|
| 1106 |
+
"# api_key=EMBEDDING_API_KEY,\n",
|
| 1107 |
+
"# api_base=EMBEDDING_BASE,\n",
|
| 1108 |
+
"# api_version=EMBEDDING_VERSION,\n",
|
| 1109 |
+
"# )\n",
|
| 1110 |
+
"# print(response)"
|
| 1111 |
+
]
|
| 1112 |
+
},
|
| 1113 |
+
{
|
| 1114 |
+
"cell_type": "code",
|
| 1115 |
+
"execution_count": null,
|
| 1116 |
+
"metadata": {},
|
| 1117 |
+
"outputs": [],
|
| 1118 |
+
"source": [
|
| 1119 |
+
"# chunk_step = 250\n",
|
| 1120 |
+
"# for chunk_idx in range(0, len(chunks), chunk_step):\n",
|
| 1121 |
+
"# vector_store.add_documents(\n",
|
| 1122 |
+
"# documents=chunks[chunk_idx : chunk_idx + chunk_step],\n",
|
| 1123 |
+
"# ids=uuids[chunk_idx : chunk_idx + chunk_step],\n",
|
| 1124 |
+
"# )\n",
|
| 1125 |
+
"# print(\n",
|
| 1126 |
+
"# f\"Saved {len(chunks[chunk_idx : chunk_idx + chunk_step])} chunks to {CHROMA_PATH}.\"\n",
|
| 1127 |
+
"# )\n",
|
| 1128 |
+
"# print(len(vector_store.get()[\"documents\"]))"
|
| 1129 |
+
]
|
| 1130 |
+
}
|
| 1131 |
+
],
|
| 1132 |
+
"metadata": {
|
| 1133 |
+
"language_info": {
|
| 1134 |
+
"name": "python"
|
| 1135 |
+
}
|
| 1136 |
+
},
|
| 1137 |
+
"nbformat": 4,
|
| 1138 |
+
"nbformat_minor": 2
|
| 1139 |
+
}
|
dataset/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
# Hugging Face Transformers documentation as markdown dataset
|
| 6 |
+
|
| 7 |
+
This dataset was created using [Clipper.js](https://github.com/philschmid/clipper.js). Clipper is a Node.js command line tool that allows you to easily clip content from web pages and convert it to Markdown. It uses Mozilla's Readability library and Turndown under the hood to parse web page content and convert it to Markdown.
|
| 8 |
+
|
| 9 |
+
This dataset can be used to create RAG applications, which want to use the transformers documentation.
|
| 10 |
+
|
| 11 |
+
Example document: https://huggingface.co/docs/transformers/peft
|
| 12 |
+
```
|
| 13 |
+
# Load adapters with 🤗 PEFT
|
| 14 |
+
|
| 15 |
+
[Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. The adapters are trained to learn task-specific information. This approach has been shown to be very memory-efficient with lower compute usage while producing results comparable to a fully fine-tuned model.
|
| 16 |
+
|
| 17 |
+
Adapters trained with PEFT are also usually an order of magnitude smaller than the full model, making it convenient to share, store, and load them.
|
| 18 |
+
|
| 19 |
+

|
| 20 |
+
|
| 21 |
+
The adapter weights for a OPTForCausalLM model stored on the Hub are only ~6MB compared to the full size of the model weights, which can be ~700MB.
|
| 22 |
+
|
| 23 |
+
If you’re interested in learning more about the 🤗 PEFT library, check out the [documentation](https://huggingface.co/docs/peft/index).
|
| 24 |
+
|
| 25 |
+
## Setup
|
| 26 |
+
|
| 27 |
+
Get started by installing 🤗 PEFT:
|
| 28 |
+
|
| 29 |
+
If you want to try out the brand new features, you might be interested in installing the library from source:
|
| 30 |
+
|
| 31 |
+
....
|
| 32 |
+
|
| 33 |
+
```
|
demo.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import constants as cte
|
| 3 |
+
from rag_smolagent import SmolAgent, MessageRole
|
| 4 |
+
from googlesearch import search
|
| 5 |
+
from typing import List, Tuple, Optional, Any
|
| 6 |
+
|
| 7 |
+
class ChatState:
    """Mutable container for the per-session chat state shared by the UI callbacks."""

    def __init__(self) -> None:
        # Markdown-formatted sources backing the last answer.
        self.sources: str = "There are no sources yet."
        # Sentence combining age and residence, injected into the prompt.
        self.extra_info: str = ""
        # Raw user profile fields (kept as strings; may be empty).
        self.age: str = ""
        self.residence: str = ""
        # Preferred answer style; "Concise" by default.
        self.response_type: str = "Concise"
        # Messages already surfaced to the user (avoids duplicates).
        self.seen_messages: List[Any] = []
        # Accumulated chain-of-thought steps extracted from agent output.
        self.cot_steps: str = ""
|
| 16 |
+
|
| 17 |
+
state = ChatState()
|
| 18 |
+
|
| 19 |
+
# Initialize the SmolAgent with the specified data path
|
| 20 |
+
agent = SmolAgent()
|
| 21 |
+
|
| 22 |
+
def get_first_result(query: str) -> Optional[str]:
    """
    Search Google with a processed version of *query* and return the URL of
    the first result, or None when there is no result or the search fails.

    The query is expected to be a comma-separated source reference; only its
    most specific trailing part(s) are used, with " manual de la renta 2023"
    appended so the search stays within the 2023 Renta manual.
    """
    parts = query.split(",")
    if len(parts) > 3:
        msg = parts[-3] + parts[-2] + " manual de la renta 2023"  # search only in 2023's Manual
    elif len(parts) >= 2:
        msg = parts[-2] + " manual de la renta 2023"
    else:
        # No comma-separated structure: fall back to the whole reference.
        # (The original indexed parts[-2] unconditionally and raised
        # IndexError outside the try block for single-part queries.)
        msg = parts[0] + " manual de la renta 2023"

    try:
        results = search(
            msg,
            num_results=1,  # Desired number of results
            lang="es",      # Search language (optional)
        )
        # next(..., None) returns None on an empty result set instead of
        # relying on the broad except below to swallow StopIteration.
        # (A generator is always truthy, so `if results` never filtered.)
        return next(iter(results), None)
    except Exception as e:
        # Network/search errors are non-fatal for the UI: log and degrade.
        print(f"Error: {e}")
        return None
|
| 45 |
+
|
| 46 |
+
def process_sources(sources: str) -> str:
    """
    Turn a newline-separated list of source references into Markdown,
    linking each reference to the first Google search result for it.

    References without a search result are emitted as plain text suffixed
    with "(Sin resultados)" rather than as a link to nowhere.
    """
    results: List[str] = []

    for line in sources.strip().split('\n'):
        clean_line = line.strip()
        if not clean_line:
            continue
        url = get_first_result(clean_line)
        if url:
            # The URL must not contain a trailing newline — the original
            # appended "\n" inside the (...) target, breaking the link.
            results.append(f"[{clean_line}]({url})")
        else:
            # Previously this produced [ref](Sin resultados), a Markdown
            # link whose target was the literal fallback text.
            results.append(f"{clean_line} (Sin resultados)")

    return '\n'.join(results)
|
| 62 |
+
|
| 63 |
+
def update_age(value: str) -> None:
    """Store the user's age in the shared state and rebuild the extra info."""
    state.age = value or ""
    update_extra_info()
|
| 67 |
+
|
| 68 |
+
def update_residence(value: str) -> None:
    """Store the user's residence in the shared state and rebuild the extra info."""
    state.residence = value or ""
    update_extra_info()
|
| 72 |
+
|
| 73 |
+
def update_extra_info() -> None:
    """Recompute the extra-info sentence from the stored age and residence.

    The sentence is produced only when both fields are set; otherwise the
    extra info is cleared.
    """
    have_both = bool(state.age) and bool(state.residence)
    state.extra_info = (
        f"Tengo {state.age} años y resido en {state.residence}." if have_both else ""
    )
|
| 79 |
+
|
| 80 |
+
def update_response_type(value: str) -> None:
    """Persist the selected answer style (e.g. "Concise"/"Detailed") in the state."""
    state.response_type = value
|
| 83 |
+
|
| 84 |
+
def get_prompt(query: str, extra_info: str, response_type: str) -> str:
    """Fill the prompt template with the query, user context and answer style."""
    return cte.PROMPT_TEMPLATE.format(
        query=query,
        extra_info=extra_info,
        response_type=response_type,
    )
|
| 87 |
+
|
| 88 |
+
def chatbot_response(message: str, history: List[Tuple[str, str]]) -> str:
    """Run the agent on the user's message and post-process its output.

    Side effects on the module-level state:
      * state.sources       — text after a "Fuentes:" marker, or a no-sources note.
      * state.cot_steps     — newly seen assistant "Thought:" steps, formatted.
      * state.seen_messages — extended with the memory entries just processed.

    Args:
        message: The user's question.
        history: Current chat history (unused here; kept for the UI contract).

    Returns:
        The main answer text (the portion after "Respuesta:" when present).
    """
    response: str = agent(get_prompt(message, state.extra_info, state.response_type))

    # Extract sources if present (split once instead of twice).
    if "Fuentes:" in response:
        parts = response.split("Fuentes:")
        state.sources = parts[1].strip()
        response = parts[0].strip()
    else:
        state.sources = "No sources were used for the generation of this message"

    # Extract the main response
    if "Respuesta:" in response:
        answer: str = response.split("Respuesta:")[1].strip()
    else:
        answer = response

    # Extract Chain of Thought (CoT) steps from the agent's memory.
    cot_messages: List[Any] = agent.agent.write_memory_to_messages()
    state.cot_steps = ""
    cot_step_counter: int = 0

    # `mem_message` avoids shadowing the `message` parameter; `and` (not the
    # bitwise `&`) is the correct boolean conjunction and short-circuits the
    # membership test on seen_messages.
    for mem_message in cot_messages:
        if mem_message.get('role') == MessageRole.ASSISTANT and mem_message not in state.seen_messages:
            for content in mem_message.get('content', []):
                if content.get('type') == 'text' and 'Thought:' in content.get('text', ''):
                    state.cot_steps += f"\n============================== Step {cot_step_counter} ==============================\n"
                    cot_step_counter += 1
                    state.cot_steps += (content['text'] + "\n")
            state.seen_messages.append(mem_message)

    return answer.strip()
|
| 120 |
+
|
| 121 |
+
def respond(message, chat_history):
    """Stream one chat turn: placeholder bubble, progress notice, final answer.

    Yields (textbox_value, chat_history) pairs so Gradio updates the UI
    incrementally while the agent is working.
    """

    def _accordion(title: str, body: str) -> str:
        # Collapsible HTML block rendered inside the chat bubble.
        return (
            "\n\n<details>\n"
            f"  <summary style='color: #0090ff; cursor: pointer;'>{title}</summary>\n\n"
            f"  {body}\n\n"
            "</details>"
        )

    chat_history.append((message, ""))
    yield "", chat_history

    chat_history[-1] = (message, "⏳ Procesando respuesta...")
    yield "", chat_history

    bot_message: str = chatbot_response(message, chat_history)

    # Attach sources as a collapsible section when the agent reported any.
    if state.sources != "No sources were used for the generation of this message":
        try:
            all_url = process_sources(state.sources)
        except Exception:
            # Link generation is best-effort; fall back to the raw text.
            all_url = state.sources
        bot_message += _accordion("Sources", all_url)

    # Attach the Chain of Thought, when available, the same way.
    if state.cot_steps:
        bot_message += _accordion("Chain of Thought", state.cot_steps)

    chat_history[-1] = (message, bot_message)
    yield "", chat_history
|
| 156 |
+
|
| 157 |
+
def clear_history(chat_history: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Reset the conversation: empty the history, rebuild the agent, clear state."""
    global agent
    chat_history.clear()
    # A fresh agent instance drops the previous conversation's memory.
    agent = SmolAgent()
    state.sources = "There are no sources yet."
    state.seen_messages.clear()
    return chat_history
|
| 165 |
+
|
| 166 |
+
# Gradio UI setup
# Load the project's custom visual theme.
custom_theme = gr.Theme.load("/teamspace/studios/this_studio/AgenticRAG/theme.json")

with gr.Blocks(
    fill_height=True,
    fill_width=True,
    theme=custom_theme
) as demo:

    ## Demo Sidebar
    # The sidebar collects user context (age, residence) and answer preferences;
    # the callbacks store everything in the module-level `state`.
    with gr.Sidebar():
        gr.Markdown("## User Information")
        age_input = gr.Textbox(label="Age", placeholder="Unknown", value=state.age)
        residence_input = gr.Textbox(label="Residence", placeholder="Unknown", value=state.residence)

        # Update values when fields change
        age_input.change(update_age, inputs=age_input)
        residence_input.change(update_residence, inputs=residence_input)

        gr.Markdown("## Answer customization")
        # Dropdown for response type
        response_type_dropdown = gr.Dropdown(
            label="Response Type",
            choices=["Concise", "Detailed"],
            value="Concise"
        )
        response_type_dropdown.change(update_response_type, inputs=response_type_dropdown)

        # Clear History button
        clear_button = gr.Button("Clear History", variant="secondary")

        # External link to the official 2023 tax manual.
        web_link = gr.HTML("<a href='https://sede.agenciatributaria.gob.es/Sede/Ayuda/23Manual/100.html' target='_blank'>Abrir página web</a>")

    gr.Markdown("# RAG ChatBot Agent")

    with gr.Column(scale=1):
        # Main chat area.
        chatbot = gr.Chatbot(
            label="Chat",
            bubble_full_width=True,
            scale=1,
            avatar_images=(None, "https://logosandtypes.com/wp-content/uploads/2022/03/cognizant.svg"),
            group_consecutive_messages=False
        )

        # User input box; submit_btn=True renders an inline send button.
        msg = gr.Textbox(
            show_label=False,
            placeholder="What do you want to know?",
            submit_btn=True
        )

    # Submit and clear history functionality
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear_button.click(clear_history, inputs=chatbot, outputs=chatbot)

# share=True exposes a public Gradio link in addition to the local server.
demo.launch(share=True)
|
doc_renta/Renta_2023_doc_int.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
doc_renta/Renta_2023_doc_int_corrected.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
doc_renta/Renta_2023_doc_int_corrected_2.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
doc_renta/indice.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
doc_renta/parser.ipynb
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Parse PDF with Document Intelligence"
|
| 8 |
+
]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"cell_type": "code",
|
| 12 |
+
"execution_count": 4,
|
| 13 |
+
"metadata": {},
|
| 14 |
+
"outputs": [],
|
| 15 |
+
"source": [
|
| 16 |
+
"from azure.core.credentials import AzureKeyCredential\n",
|
| 17 |
+
"from azure.ai.documentintelligence import DocumentIntelligenceClient\n",
|
| 18 |
+
"from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat, AnalyzeResult"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": 5,
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [],
|
| 26 |
+
"source": [
|
| 27 |
+
"AZURE_ENDPOINT = \"https://document-intelligence-rag-spai.cognitiveservices.azure.com/\"\n",
|
| 28 |
+
"import os\n",
"AZURE_KEY = os.environ[\"AZURE_DOCINTEL_KEY\"]  # never commit API keys; revoke the previously committed key\n",
|
| 29 |
+
"PDF_PATH = \"/teamspace/studios/this_studio/AgenticRAG/papers/ManualRenta2023_es_es.pdf\" \n",
|
| 30 |
+
"OUTPUT_MD = \"Renta_2023_doc_int.md\""
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": 3,
|
| 36 |
+
"metadata": {},
|
| 37 |
+
"outputs": [],
|
| 38 |
+
"source": [
|
| 39 |
+
"document_intelligence_client = DocumentIntelligenceClient(endpoint=AZURE_ENDPOINT, credential=AzureKeyCredential(AZURE_KEY))\n",
|
| 40 |
+
"with open(PDF_PATH, \"rb\") as f:\n",
|
| 41 |
+
" poller = document_intelligence_client.begin_analyze_document(\n",
|
| 42 |
+
" \"prebuilt-layout\",\n",
|
| 43 |
+
" f,\n",
|
| 44 |
+
" output_content_format=DocumentContentFormat.MARKDOWN,\n",
|
| 45 |
+
")\n",
|
| 46 |
+
"result: AnalyzeResult = poller.result()"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": 6,
|
| 52 |
+
"metadata": {},
|
| 53 |
+
"outputs": [],
|
| 54 |
+
"source": [
|
| 55 |
+
"with open(OUTPUT_MD, 'w', encoding=\"utf-8\") as f:\n",
|
| 56 |
+
" f.write(result.content)"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "markdown",
|
| 61 |
+
"metadata": {},
|
| 62 |
+
"source": [
|
| 63 |
+
"# Extract PDF Table of Contents"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"cell_type": "code",
|
| 68 |
+
"execution_count": 2,
|
| 69 |
+
"metadata": {},
|
| 70 |
+
"outputs": [],
|
| 71 |
+
"source": [
|
| 72 |
+
"# Bash\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"# ```mutool show ManualRenta2023_es_es.pdf outline > indice.txt```\n"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "markdown",
|
| 79 |
+
"metadata": {},
|
| 80 |
+
"source": [
|
| 81 |
+
"_Una vez extraído el TOC con la herramienta MuPDF, se ha guardado en indice.txt y parseado a indice.md_"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "markdown",
|
| 86 |
+
"metadata": {},
|
| 87 |
+
"source": [
|
| 88 |
+
"## Parse Titles"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "markdown",
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"source": [
|
| 95 |
+
"_Limpieza manual de algunos headers y títulos en indice.md y Renta_2023_doc_int.md_"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "code",
|
| 100 |
+
"execution_count": null,
|
| 101 |
+
"metadata": {},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"import re\n",
|
| 105 |
+
"import difflib\n",
|
| 106 |
+
"import unicodedata\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"def normalize_text(text):\n",
|
| 109 |
+
" \"\"\"\n",
|
| 110 |
+
" Normaliza el texto:\n",
|
| 111 |
+
" - Reemplaza backslashes, saltos de línea y exceso de espacios.\n",
|
| 112 |
+
" - Elimina acentos y pasa el texto a minúsculas.\n",
|
| 113 |
+
" - Elimina casi toda la puntuación (se conserva el punto).\n",
|
| 114 |
+
" \"\"\"\n",
|
| 115 |
+
" text = text.replace('\\\\', ' ')\n",
|
| 116 |
+
" text = text.replace('\\n', ' ')\n",
|
| 117 |
+
" text = text.strip()\n",
|
| 118 |
+
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 119 |
+
" text = unicodedata.normalize('NFD', text)\n",
|
| 120 |
+
" text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')\n",
|
| 121 |
+
" text = text.lower()\n",
|
| 122 |
+
" text = re.sub(r'[^\\w\\s\\.]', '', text)\n",
|
| 123 |
+
" return text.strip()\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"def extract_headers_list(file_path):\n",
|
| 126 |
+
" \"\"\"\n",
|
| 127 |
+
" Extrae los headers de un archivo y devuelve una lista de tuplas:\n",
|
| 128 |
+
" (nivel, header_original, header_normalizado)\n",
|
| 129 |
+
" \"\"\"\n",
|
| 130 |
+
" headers = []\n",
|
| 131 |
+
" header_regex = re.compile(r'^(#+)\\s*(.+)$')\n",
|
| 132 |
+
" with open(file_path, 'r', encoding='utf-8') as f:\n",
|
| 133 |
+
" for line in f:\n",
|
| 134 |
+
" line = line.rstrip(\"\\n\")\n",
|
| 135 |
+
" if line.strip().startswith('#'):\n",
|
| 136 |
+
" m = header_regex.match(line.strip())\n",
|
| 137 |
+
" if m:\n",
|
| 138 |
+
" level = len(m.group(1))\n",
|
| 139 |
+
" header_text = m.group(2).strip()\n",
|
| 140 |
+
" norm = normalize_text(header_text)\n",
|
| 141 |
+
" headers.append((level, header_text, norm))\n",
|
| 142 |
+
" return headers\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"# ejecuta hasta que todos los headers coincidan\n",
|
| 145 |
+
"while True:\n",
|
| 146 |
+
" # 1. Extraer headers de indice.md y del documento.\n",
|
| 147 |
+
" indice_headers = extract_headers_list('indice.md')\n",
|
| 148 |
+
" corrected_headers = extract_headers_list('Renta_2023_doc_int_corrected.md')\n",
|
| 149 |
+
" \n",
|
| 150 |
+
" # Convertir a listas de cadenas con formato uniforme (ej. \"## Título\")\n",
|
| 151 |
+
" indice_headers_str = [f\"{'#'*level} {text}\" for (level, text, norm) in indice_headers]\n",
|
| 152 |
+
" corrected_headers_str = [f\"{'#'*level} {text}\" for (level, text, norm) in corrected_headers]\n",
|
| 153 |
+
" \n",
|
| 154 |
+
" # Comprobar si coinciden exactamente\n",
|
| 155 |
+
" if indice_headers_str == corrected_headers_str:\n",
|
| 156 |
+
" print(\"¡Todos los headers coinciden exactamente!\")\n",
|
| 157 |
+
" break\n",
|
| 158 |
+
"\n",
|
| 159 |
+
" # 2. Determinar el primer header diferente\n",
|
| 160 |
+
" min_len = min(len(indice_headers_str), len(corrected_headers_str))\n",
|
| 161 |
+
" first_diff = None\n",
|
| 162 |
+
" for i in range(min_len):\n",
|
| 163 |
+
" if indice_headers_str[i] != corrected_headers_str[i]:\n",
|
| 164 |
+
" first_diff = i\n",
|
| 165 |
+
" break\n",
|
| 166 |
+
" if first_diff is None:\n",
|
| 167 |
+
" print(\"Las listas de headers tienen distinta longitud.\")\n",
|
| 168 |
+
" break\n",
|
| 169 |
+
"\n",
|
| 170 |
+
" # 3. Abrir el documento y dividirlo en parrafos\n",
|
| 171 |
+
" with open('Renta_2023_doc_int_corrected.md', 'r', encoding='utf-8') as f:\n",
|
| 172 |
+
" doc_text = f.read()\n",
|
| 173 |
+
" paragraphs = re.split(r'\\n\\s*\\n', doc_text)\n",
|
| 174 |
+
" \n",
|
| 175 |
+
" # 4. Obtener los indices de parrafos que son headers\n",
|
| 176 |
+
" header_paragraph_indices = [idx for idx, para in enumerate(paragraphs) if para.strip().startswith('#')]\n",
|
| 177 |
+
" \n",
|
| 178 |
+
" # Definir el rango de búsqueda en los párrafos para el header a corregir\n",
|
| 179 |
+
" if first_diff == 0:\n",
|
| 180 |
+
" search_start = 0\n",
|
| 181 |
+
" else:\n",
|
| 182 |
+
" search_start = header_paragraph_indices[first_diff - 1] + 1 if first_diff - 1 < len(header_paragraph_indices) else 0\n",
|
| 183 |
+
" if first_diff < len(header_paragraph_indices) - 1:\n",
|
| 184 |
+
" search_end = header_paragraph_indices[first_diff + 1]\n",
|
| 185 |
+
" else:\n",
|
| 186 |
+
" search_end = len(paragraphs)\n",
|
| 187 |
+
" \n",
|
| 188 |
+
" expected_header = indice_headers_str[first_diff] # Header correcto esperado (con hashes correctos)\n",
|
| 189 |
+
" expected_level = indice_headers[first_diff][0] # Nivel correcto\n",
|
| 190 |
+
" expected_norm = normalize_text(indice_headers[first_diff][1])\n",
|
| 191 |
+
" \n",
|
| 192 |
+
" # 5. Buscar en el rango definido la aparición del header esperado (fuzzy matching)\n",
|
| 193 |
+
" found_candidate_idx = None\n",
|
| 194 |
+
" threshold_candidate = 0.98 # similitud\n",
|
| 195 |
+
" for i in range(search_start, search_end):\n",
|
| 196 |
+
" candidate = paragraphs[i].strip()\n",
|
| 197 |
+
" candidate_text = candidate.lstrip('#').strip() if candidate.startswith('#') else candidate\n",
|
| 198 |
+
" norm_candidate = normalize_text(candidate_text)\n",
|
| 199 |
+
" ratio = difflib.SequenceMatcher(None, expected_norm, norm_candidate).ratio()\n",
|
| 200 |
+
" if ratio >= threshold_candidate:\n",
|
| 201 |
+
" found_candidate_idx = i\n",
|
| 202 |
+
" break\n",
|
| 203 |
+
"\n",
|
| 204 |
+
" if found_candidate_idx is not None:\n",
|
| 205 |
+
" # actualizar el párrafo encontrado con el header correcto\n",
|
| 206 |
+
" new_header_line = ('#' * expected_level) + \" \" + indice_headers[first_diff][1]\n",
|
| 207 |
+
" paragraphs[found_candidate_idx] = new_header_line\n",
|
| 208 |
+
" with open('Renta_2023_doc_int_corrected_2.md', 'w', encoding='utf-8') as f:\n",
|
| 209 |
+
" f.write(\"\\n\\n\".join(paragraphs))\n",
|
| 210 |
+
" print(f\"Se ha corregido el header en el índice {first_diff}.\")\n",
|
| 211 |
+
" else:\n",
|
| 212 |
+
" print(\"No se encontró en el rango de búsqueda el header esperado.\")\n",
|
| 213 |
+
" break\n"
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"cell_type": "markdown",
|
| 218 |
+
"metadata": {},
|
| 219 |
+
"source": [
|
| 220 |
+
"## Delete figures and PageFooter"
|
| 221 |
+
]
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"cell_type": "code",
|
| 225 |
+
"execution_count": null,
|
| 226 |
+
"metadata": {},
|
| 227 |
+
"outputs": [],
|
| 228 |
+
"source": [
|
| 229 |
+
"with open('Renta_2023_doc_int_corrected_2.md', 'r', encoding='utf-8') as f:\n",
|
| 230 |
+
" new_doc = f.read()\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"new_doc = re.sub(r'<figure>\\s*Agencia Tributaria\\s*</figure>', '', new_doc, flags=re.DOTALL)\n",
|
| 233 |
+
"new_doc = new_doc.replace('', '')\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"with open('Renta_2023_doc_int_corrected_2.md', 'w', encoding='utf-8') as f:\n",
|
| 236 |
+
" f.write(new_doc)\n",
|
| 237 |
+
"\n",
|
| 238 |
+
"print(\"Archivo corregido y guardado.\")"
|
| 239 |
+
]
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
"cell_type": "markdown",
|
| 243 |
+
"metadata": {},
|
| 244 |
+
"source": [
|
| 245 |
+
"Check Titles"
|
| 246 |
+
]
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"cell_type": "code",
|
| 250 |
+
"execution_count": 1,
|
| 251 |
+
"metadata": {},
|
| 252 |
+
"outputs": [
|
| 253 |
+
{
|
| 254 |
+
"name": "stdout",
|
| 255 |
+
"output_type": "stream",
|
| 256 |
+
"text": [
|
| 257 |
+
"Conteo de títulos:\n",
|
| 258 |
+
"{'correctos': 1457, 'incorrectos': 0}\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"Títulos incorrectos:\n"
|
| 261 |
+
]
|
| 262 |
+
}
|
| 263 |
+
],
|
| 264 |
+
"source": [
|
| 265 |
+
"import re\n",
|
| 266 |
+
"\n",
|
| 267 |
+
"def analizar_titulos_markdown_desde_archivos(archivo_contenido, archivo_titulos):\n",
|
| 268 |
+
" try:\n",
|
| 269 |
+
" with open(archivo_contenido, 'r', encoding='utf-8') as f:\n",
|
| 270 |
+
" contenido_markdown = f.read()\n",
|
| 271 |
+
" except FileNotFoundError:\n",
|
| 272 |
+
" print(f\"Error: No se encontró el archivo del contenido: {archivo_contenido}\")\n",
|
| 273 |
+
" return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
|
| 274 |
+
" except Exception as e:\n",
|
| 275 |
+
" print(f\"Error al leer el archivo del contenido: {archivo_contenido} - {e}\")\n",
|
| 276 |
+
" return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" try:\n",
|
| 279 |
+
" with open(archivo_titulos, 'r', encoding='utf-8') as f:\n",
|
| 280 |
+
" listado_titulos_markdown = f.read()\n",
|
| 281 |
+
" except FileNotFoundError:\n",
|
| 282 |
+
" print(f\"Error: No se encontró el archivo de títulos: {archivo_titulos}\")\n",
|
| 283 |
+
" return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
|
| 284 |
+
" except Exception as e:\n",
|
| 285 |
+
" print(f\"Error al leer el archivo de títulos: {archivo_titulos} - {e}\")\n",
|
| 286 |
+
" return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
|
| 287 |
+
"\n",
|
| 288 |
+
" # 1. Preparar el listado de títulos esperado\n",
|
| 289 |
+
" titulos_esperados = []\n",
|
| 290 |
+
" for linea in listado_titulos_markdown.splitlines():\n",
|
| 291 |
+
" linea = linea.strip()\n",
|
| 292 |
+
" if linea: # Ignorar líneas vacías\n",
|
| 293 |
+
" titulos_esperados.append(linea)\n",
|
| 294 |
+
"\n",
|
| 295 |
+
" # 2. Extraer los títulos del contenido del documento\n",
|
| 296 |
+
" titulos_encontrados = []\n",
|
| 297 |
+
" for linea in re.findall(r\"^(#+ .+)$\", contenido_markdown, re.MULTILINE):\n",
|
| 298 |
+
" titulos_encontrados.append(linea)\n",
|
| 299 |
+
" \n",
|
| 300 |
+
" # 3. Comparar los títulos y contar correctos/incorrectos\n",
|
| 301 |
+
" correctos = 0\n",
|
| 302 |
+
" incorrectos = 0\n",
|
| 303 |
+
" titulos_incorrectos = []\n",
|
| 304 |
+
" \n",
|
| 305 |
+
" i = 0\n",
|
| 306 |
+
" j = 0\n",
|
| 307 |
+
" while i < len(titulos_esperados) and j < len(titulos_encontrados):\n",
|
| 308 |
+
" if titulos_esperados[i] == titulos_encontrados[j]:\n",
|
| 309 |
+
" correctos += 1\n",
|
| 310 |
+
" i += 1\n",
|
| 311 |
+
" j += 1\n",
|
| 312 |
+
" else:\n",
|
| 313 |
+
" incorrectos += 1\n",
|
| 314 |
+
" titulos_incorrectos.append({\n",
|
| 315 |
+
" \"esperado\": titulos_esperados[i],\n",
|
| 316 |
+
" \"encontrado\": titulos_encontrados[j]\n",
|
| 317 |
+
" })\n",
|
| 318 |
+
" i+=1\n",
|
| 319 |
+
" j+=1\n",
|
| 320 |
+
" \n",
|
| 321 |
+
" # Si quedan títulos por verificar, se consideran incorrectos\n",
|
| 322 |
+
" while i < len(titulos_esperados):\n",
|
| 323 |
+
" incorrectos += 1\n",
|
| 324 |
+
" titulos_incorrectos.append({\n",
|
| 325 |
+
" \"esperado\": titulos_esperados[i],\n",
|
| 326 |
+
" \"encontrado\": None \n",
|
| 327 |
+
" })\n",
|
| 328 |
+
" i+=1\n",
|
| 329 |
+
" \n",
|
| 330 |
+
" while j < len(titulos_encontrados):\n",
|
| 331 |
+
" incorrectos += 1\n",
|
| 332 |
+
" titulos_incorrectos.append({\n",
|
| 333 |
+
" \"esperado\": None,\n",
|
| 334 |
+
" \"encontrado\": titulos_encontrados[j] \n",
|
| 335 |
+
" })\n",
|
| 336 |
+
" j+=1\n",
|
| 337 |
+
" \n",
|
| 338 |
+
" return {\"correctos\": correctos, \"incorrectos\": incorrectos}, titulos_incorrectos\n",
|
| 339 |
+
"\n",
|
| 340 |
+
"archivo_documento = \"/teamspace/studios/this_studio/AgenticRAG/doc_renta/Renta_2023_doc_int_corrected_2.md\" \n",
|
| 341 |
+
"archivo_lista_titulos = \"/teamspace/studios/this_studio/AgenticRAG/doc_renta/indice.md\" \n",
|
| 342 |
+
"\n",
|
| 343 |
+
"resultados, incorrectos = analizar_titulos_markdown_desde_archivos(archivo_documento, archivo_lista_titulos)\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"print(\"Conteo de títulos:\")\n",
|
| 346 |
+
"print(resultados)\n",
|
| 347 |
+
"\n",
|
| 348 |
+
"print(\"\\nTítulos incorrectos:\")\n",
|
| 349 |
+
"for error in incorrectos:\n",
|
| 350 |
+
" print(f\"- Esperado: {error['esperado']}, Encontrado: {error['encontrado']}\")"
|
| 351 |
+
]
|
| 352 |
+
}
|
| 353 |
+
],
|
| 354 |
+
"metadata": {
|
| 355 |
+
"language_info": {
|
| 356 |
+
"name": "python"
|
| 357 |
+
}
|
| 358 |
+
},
|
| 359 |
+
"nbformat": 4,
|
| 360 |
+
"nbformat_minor": 2
|
| 361 |
+
}
|
docs/images/AgenticRAG.png
ADDED
|
docs/images/naive_rag.jpg
ADDED
|
papers/A_Survey_of_Large_Language_Models.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ee593c3c9a1840d3fe0d289a124657624b545d62efd3f2ecd265cc1ee16d9af
|
| 3 |
+
size 6088239
|
papers/BERT.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5692a5514787a8c6727b4ff3b726a3385798bc68e12138d1d4af83947e2acf6e
|
| 3 |
+
size 775166
|
papers/Language_Models_are_Few_Shot_Learners.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97fd272f1fdfc18677462d0292f5fbf26ca86b4d1b485c2dba03269b643a0e83
|
| 3 |
+
size 6768044
|
papers/Large_Language_Models_Survey.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:653756902d04dd01004561c0216c08c5cea8c1c64369390ebefa1747850a0867
|
| 3 |
+
size 4871171
|