Spaces:
Build error
Build error
Resolved merge conflict in README.md
Browse files- .dockerignore +23 -0
- .gitignore +255 -0
- .gradio/certificate.pem +31 -0
- Data/__init__.py +0 -0
- Data/get_video_link.py +152 -0
- Data/new_video_added.py +22 -0
- Data/yt_transcript.py +94 -0
- Dockerfile +49 -0
- Example/__init__.py +0 -0
- Example/rag_example.py +18 -0
- Llm/__init__.py +0 -0
- Llm/llm_endpoints.py +14 -0
- Prompts/__init__.py +0 -0
- Prompts/huberman_prompt.py +20 -0
- Prompts/summary_prompt.py +10 -0
- README.md +99 -0
- Rag/Processed_folder/processed_files.json +1 -0
- Rag/__init__.py +0 -0
- Rag/rag_pipeline.py +183 -0
- poetry.lock +0 -0
- pyproject.toml +34 -0
- requirements.in +18 -0
- requirements.txt +20 -0
- setup.sh +10 -0
- ui/__init__.py +0 -0
- ui/app.py +147 -0
- utils/__init__.py +0 -0
- utils/corefrence.py +52 -0
- utils/get_link.py +11 -0
- utils/summarization.py +14 -0
.dockerignore
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore version control
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# Ignore notebooks
|
| 6 |
+
Notebook/
|
| 7 |
+
|
| 8 |
+
# Ignore databases and logs
|
| 9 |
+
**/*.db
|
| 10 |
+
**/*.sqlite3
|
| 11 |
+
**/chromadb.db
|
| 12 |
+
**/error_log.txt
|
| 13 |
+
|
| 14 |
+
# Ignore cache
|
| 15 |
+
**/__pycache__/
|
| 16 |
+
**/*.pyc
|
| 17 |
+
**/*.pyo
|
| 18 |
+
**/*.pyd
|
| 19 |
+
|
| 20 |
+
# Ignore environment files
|
| 21 |
+
.env
|
| 22 |
+
venv/
|
| 23 |
+
.venv/
|
.gitignore
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tmp
|
| 2 |
+
.idea
|
| 3 |
+
models
|
| 4 |
+
|
| 5 |
+
stanford-ner-2015-04-20.zip
|
| 6 |
+
stanford-ner-2015-04-20
|
| 7 |
+
*.pyc
|
| 8 |
+
|
| 9 |
+
### Python template
|
| 10 |
+
# Byte-compiled / optimized / DLL files
|
| 11 |
+
__pycache__/
|
| 12 |
+
*.py[cod]
|
| 13 |
+
*$py.class
|
| 14 |
+
|
| 15 |
+
# C extensions
|
| 16 |
+
*.so
|
| 17 |
+
|
| 18 |
+
# Distribution / packaging
|
| 19 |
+
.Python
|
| 20 |
+
build/
|
| 21 |
+
develop-eggs/
|
| 22 |
+
dist/
|
| 23 |
+
downloads/
|
| 24 |
+
eggs/
|
| 25 |
+
.eggs/
|
| 26 |
+
lib/
|
| 27 |
+
lib64/
|
| 28 |
+
parts/
|
| 29 |
+
sdist/
|
| 30 |
+
var/
|
| 31 |
+
wheels/
|
| 32 |
+
pip-wheel-metadata/
|
| 33 |
+
share/python-wheels/
|
| 34 |
+
*.egg-info/
|
| 35 |
+
.installed.cfg
|
| 36 |
+
*.egg
|
| 37 |
+
MANIFEST
|
| 38 |
+
|
| 39 |
+
# PyInstaller
|
| 40 |
+
# Usually these files are written by a python script from a template
|
| 41 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 42 |
+
*.manifest
|
| 43 |
+
*.spec
|
| 44 |
+
|
| 45 |
+
# Installer logs
|
| 46 |
+
pip-log.txt
|
| 47 |
+
pip-delete-this-directory.txt
|
| 48 |
+
|
| 49 |
+
# Unit test / coverage reports
|
| 50 |
+
htmlcov/
|
| 51 |
+
.tox/
|
| 52 |
+
.nox/
|
| 53 |
+
.coverage
|
| 54 |
+
.coverage.*
|
| 55 |
+
.cache
|
| 56 |
+
nosetests.xml
|
| 57 |
+
coverage.xml
|
| 58 |
+
*.cover
|
| 59 |
+
.hypothesis/
|
| 60 |
+
.pytest_cache/
|
| 61 |
+
|
| 62 |
+
# Translations
|
| 63 |
+
*.mo
|
| 64 |
+
*.pot
|
| 65 |
+
|
| 66 |
+
service.log.*
|
| 67 |
+
|
| 68 |
+
# Django stuff:
|
| 69 |
+
*.log
|
| 70 |
+
local_settings.py
|
| 71 |
+
db.sqlite3
|
| 72 |
+
db.sqlite3-journal
|
| 73 |
+
|
| 74 |
+
# Flask stuff:
|
| 75 |
+
instance/
|
| 76 |
+
.webassets-cache
|
| 77 |
+
|
| 78 |
+
# Scrapy stuff:
|
| 79 |
+
.scrapy
|
| 80 |
+
|
| 81 |
+
# Sphinx documentation
|
| 82 |
+
docs/_build/
|
| 83 |
+
|
| 84 |
+
# PyBuilder
|
| 85 |
+
target/
|
| 86 |
+
|
| 87 |
+
# Jupyter Notebook
|
| 88 |
+
.ipynb_checkpoints
|
| 89 |
+
|
| 90 |
+
# IPython
|
| 91 |
+
profile_default/
|
| 92 |
+
ipython_config.py
|
| 93 |
+
|
| 94 |
+
# pyenv
|
| 95 |
+
.python-version
|
| 96 |
+
|
| 97 |
+
# pipenv
|
| 98 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 99 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 100 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 101 |
+
# install all needed dependencies.
|
| 102 |
+
#Pipfile.lock
|
| 103 |
+
|
| 104 |
+
# celery beat schedule file
|
| 105 |
+
celerybeat-schedule
|
| 106 |
+
|
| 107 |
+
# SageMath parsed files
|
| 108 |
+
*.sage.py
|
| 109 |
+
|
| 110 |
+
# Environments
|
| 111 |
+
.env
|
| 112 |
+
.venv
|
| 113 |
+
env/
|
| 114 |
+
venv/
|
| 115 |
+
ENV/
|
| 116 |
+
env.bak/
|
| 117 |
+
venv.bak/
|
| 118 |
+
Data/transcripts/
|
| 119 |
+
Data/videolinks/
|
| 120 |
+
Rag/db/
|
| 121 |
+
Rag/db/chroma.sqlite3
|
| 122 |
+
Rag/chromadb.db/
|
| 123 |
+
# Spyder project settings
|
| 124 |
+
.spyderproject
|
| 125 |
+
.spyproject
|
| 126 |
+
|
| 127 |
+
# Rope project settings
|
| 128 |
+
.ropeproject
|
| 129 |
+
|
| 130 |
+
# mkdocs documentation
|
| 131 |
+
/site
|
| 132 |
+
__pycache__/
|
| 133 |
+
*.pyc
|
| 134 |
+
*.pyo
|
| 135 |
+
*.pyd
|
| 136 |
+
.env
|
| 137 |
+
# mypy
|
| 138 |
+
.mypy_cache/
|
| 139 |
+
.dmypy.json
|
| 140 |
+
dmypy.json
|
| 141 |
+
|
| 142 |
+
# Pyre type checker
|
| 143 |
+
.pyre/
|
| 144 |
+
|
| 145 |
+
### JetBrains template
|
| 146 |
+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
|
| 147 |
+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
| 148 |
+
|
| 149 |
+
# User-specific stuff
|
| 150 |
+
.idea/**/workspace.xml
|
| 151 |
+
.idea/**/tasks.xml
|
| 152 |
+
.idea/**/usage.statistics.xml
|
| 153 |
+
.idea/**/dictionaries
|
| 154 |
+
.idea/**/shelf
|
| 155 |
+
|
| 156 |
+
# Generated files
|
| 157 |
+
.idea/**/contentModel.xml
|
| 158 |
+
|
| 159 |
+
# Sensitive or high-churn files
|
| 160 |
+
.idea/**/dataSources/
|
| 161 |
+
.idea/**/dataSources.ids
|
| 162 |
+
.idea/**/dataSources.local.xml
|
| 163 |
+
.idea/**/sqlDataSources.xml
|
| 164 |
+
.idea/**/dynamic.xml
|
| 165 |
+
.idea/**/uiDesigner.xml
|
| 166 |
+
.idea/**/dbnavigator.xml
|
| 167 |
+
|
| 168 |
+
# Gradle
|
| 169 |
+
.idea/**/gradle.xml
|
| 170 |
+
.idea/**/libraries
|
| 171 |
+
|
| 172 |
+
# Gradle and Maven with auto-import
|
| 173 |
+
# When using Gradle or Maven with auto-import, you should exclude module files,
|
| 174 |
+
# since they will be recreated, and may cause churn. Uncomment if using
|
| 175 |
+
# auto-import.
|
| 176 |
+
# .idea/modules.xml
|
| 177 |
+
# .idea/*.iml
|
| 178 |
+
# .idea/modules
|
| 179 |
+
# *.iml
|
| 180 |
+
# *.ipr
|
| 181 |
+
|
| 182 |
+
# CMake
|
| 183 |
+
cmake-build-*/
|
| 184 |
+
|
| 185 |
+
#
|
| 186 |
+
Mongo Explorer plugin
|
| 187 |
+
.idea/**/mongoSettings.xml
|
| 188 |
+
|
| 189 |
+
# File-based project format
|
| 190 |
+
*.iws
|
| 191 |
+
|
| 192 |
+
# IntelliJ
|
| 193 |
+
out/
|
| 194 |
+
|
| 195 |
+
# mpeltonen/sbt-idea plugin
|
| 196 |
+
.idea_modules/
|
| 197 |
+
|
| 198 |
+
# JIRA plugin
|
| 199 |
+
atlassian-ide-plugin.xml
|
| 200 |
+
|
| 201 |
+
# Cursive Clojure plugin
|
| 202 |
+
.idea/replstate.xml
|
| 203 |
+
|
| 204 |
+
# Crashlytics plugin (for Android Studio and IntelliJ)
|
| 205 |
+
com_crashlytics_export_strings.xml
|
| 206 |
+
crashlytics.properties
|
| 207 |
+
crashlytics-build.properties
|
| 208 |
+
fabric.properties
|
| 209 |
+
|
| 210 |
+
# Editor-based Rest Client
|
| 211 |
+
.idea/httpRequests
|
| 212 |
+
|
| 213 |
+
# Android studio 3.1+ serialized cache file
|
| 214 |
+
.idea/caches/build_file_checksums.ser
|
| 215 |
+
|
| 216 |
+
### VirtualEnv template
|
| 217 |
+
# Virtualenv
|
| 218 |
+
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
| 219 |
+
.Python
|
| 220 |
+
[Bb]in
|
| 221 |
+
[Ii]nclude
|
| 222 |
+
[Ll]ib
|
| 223 |
+
[Ll]ib64
|
| 224 |
+
[Ll]ocal
|
| 225 |
+
[Ss]cripts
|
| 226 |
+
pyvenv.cfg
|
| 227 |
+
.venv
|
| 228 |
+
pip-selfcheck.json
|
| 229 |
+
|
| 230 |
+
files
|
| 231 |
+
Files
|
| 232 |
+
*.tmp
|
| 233 |
+
.vscode
|
| 234 |
+
my_virtual_environment
|
| 235 |
+
dist
|
| 236 |
+
crf_py_utils.egg-info
|
| 237 |
+
build
|
| 238 |
+
datas
|
| 239 |
+
tests/data
|
| 240 |
+
venv
|
| 241 |
+
create_docker_image.sh
|
| 242 |
+
|
| 243 |
+
anydonebert/data
|
| 244 |
+
|
| 245 |
+
results
|
| 246 |
+
train_test_split
|
| 247 |
+
|
| 248 |
+
anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v1
|
| 249 |
+
anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v2
|
| 250 |
+
resources/conll_files/*
|
| 251 |
+
resources/test_xml_files/*
|
| 252 |
+
resources/xml_files/*
|
| 253 |
+
config.ini
|
| 254 |
+
flowcess/commons/settings.py
|
| 255 |
+
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
Data/__init__.py
ADDED
|
File without changes
|
Data/get_video_link.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from Data.new_video_added import get_new_video_url
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
api_key = os.getenv('API_KEY')
|
| 11 |
+
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 12 |
+
BASE_URL = "https://www.googleapis.com/youtube/v3"
|
| 13 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 14 |
+
channel = "https://www.youtube.com/@hubermanlab/videos"
|
| 15 |
+
new_video_added = False
|
| 16 |
+
# video_links_folder_name = os.path.join(BASE_DIR, "videolinks")
|
| 17 |
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
| 18 |
+
# print("THIS IS BASE DIR:", BASE_DIR)
|
| 19 |
+
# print("THIS is current dir:", CURRENT_DIR)
|
| 20 |
+
# video_links_folder_name = os.path.join(CURRENT_DIR, "videolinks")
|
| 21 |
+
video_links_folder_name = os.path.join(PROJECT_ROOT, "Data", "video_links")
|
| 22 |
+
|
| 23 |
+
def ensure_directories():
    """Create the video-links folder on first run (no-op when it already exists)."""
    if os.path.exists(video_links_folder_name):
        return
    os.makedirs(video_links_folder_name)
    print(f"Directory {video_links_folder_name} created")
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_chanel_id(chanel_name):
    """Look up a YouTube channel ID by name via the Data API search endpoint.

    Returns the first matching channel's ID, or None when no match is found
    or the request/JSON decoding fails.
    """
    url = f"{BASE_URL}/search"
    params = {
        "part": "snippet",
        "q": chanel_name,
        "type": "channel",
        "key": api_key,
    }
    try:
        # Pass params by keyword and bound the request so a dead network
        # cannot hang the caller indefinitely.
        response = requests.get(url, params=params, timeout=30)
        response_data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching channel id for '{chanel_name}': {e}")
        return None
    items = response_data.get("items")
    if items:
        return items[0]["snippet"]["channelId"]
    return None
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def get_video_links(channel_id):
    """Fetch every video watch-URL for a channel, following API pagination.

    Returns a list of "https://www.youtube.com/watch?v=<id>" strings; stops
    early (returning what was collected so far) on network errors or when
    the API response carries no "items".
    """
    url = f"{BASE_URL}/search"
    video_links = []
    next_page_token = None

    while True:
        params = {
            "part": "snippet",
            "channelId": channel_id,
            "maxResults": 50,  # API maximum per page
            "type": "video",
            "key": api_key,
        }
        if next_page_token:
            params["pageToken"] = next_page_token

        try:
            # Timeout prevents the pagination loop from hanging forever on
            # a stalled connection.
            response = requests.get(url, params=params, timeout=30)
            response_data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Error fetching video links page: {e}")
            break

        if "items" not in response_data:
            break

        for item in response_data["items"]:
            video_id = item["id"]["videoId"]
            video_links.append(f"https://www.youtube.com/watch?v={video_id}")

        next_page_token = response_data.get("nextPageToken")
        if not next_page_token:
            break

    return video_links
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def save_video_links(video_links):
    """Persist the given video links to a timestamped JSON file.

    The file is written into ``video_links_folder_name`` (created on demand)
    as ``video_links_<YYYYmmddHHMMSS>.json``.
    """
    if not os.path.exists(video_links_folder_name):
        os.makedirs(video_links_folder_name)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = f"video_links_{timestamp}.json"
    filepath = os.path.join(video_links_folder_name, filename)
    with open(filepath, 'w') as file:
        json.dump(video_links, file)
    # Bug fix: the original message printed the literal "(unknown)" instead
    # of the destination path.
    print(f"{len(video_links)} video links saved successfully to {filepath}")
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def load_video_links():
    """
    Load the most recent video links file based on the timestamp in its name.

    Returns the list of links, or [] when the folder or any usable file is
    missing, or when the newest file cannot be read.
    """
    # Bug fix: the original printed a warning here but fell through to
    # os.listdir() on the missing directory, raising FileNotFoundError.
    if not os.path.exists(video_links_folder_name):
        print(f"{video_links_folder_name} does not exist")
        return []

    files = [f for f in os.listdir(video_links_folder_name) if f.startswith("video_links_") and f.endswith(".json")]

    if not files:
        print("No video links file found.")
        return []

    # Sort files by the timestamp embedded in their names (newest first).
    files.sort(key=lambda x: datetime.strptime(x[len("video_links_"):-len(".json")], "%Y%m%d%H%M%S"), reverse=True)

    latest_file = files[0]
    filepath = os.path.join(video_links_folder_name, latest_file)
    try:
        with open(filepath, 'r') as file:
            video_links = json.load(file)
        print(f"{len(video_links)} video links loaded successfully from {latest_file}.")
        return video_links
    except Exception as e:
        print(f"Error loading {latest_file}: {e}")
        return []
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def video_links_main():
    """Load (or fetch) the channel's video links and detect newly added videos.

    Returns a tuple ``(video_links, new_video_added, new_videos)``:
    the full list of known watch-URLs, a bool set when the channel page
    revealed unseen uploads, and the list of just those new URLs.
    """
    ensure_directories()
    video_links = load_video_links()
    if video_links:
        print(f"Using {len(video_links)} saved video links")
    else:
        # No cache yet: do a one-off interactive fetch through the Data API.
        channel_name = input("Enter the YouTube channel name: ")
        channel_id = get_chanel_id(channel_name)

        if channel_id:
            print(f"Fetching videos for channel: {channel_name} (ID: {channel_id})")
            video_links = get_video_links(channel_id)
            save_video_links(video_links)
        else:
            print("Failed to fetch video links")

    # Scrape the channel page for the latest uploads and keep only unseen ones.
    new_video_url = get_new_video_url(channel)
    new_videos = [url for url in new_video_url if url not in video_links]

    if new_videos:
        print(f"{len(new_videos)} new videos found")
        video_links.extend(new_videos)
        save_video_links(video_links)
        new_video_added = True
    else:
        print("No new videos found")
        new_video_added = False
    return video_links, new_video_added, new_videos
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
|
| 152 |
+
video_links_main()
|
Data/new_video_added.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_new_video_url(channel):
    """
    Fetch all video URLs from the given YouTube channel page.

    Scrapes the raw HTML for ``"videoId":"..."`` occurrences and returns the
    corresponding watch-URLs, de-duplicated in first-seen order. Returns []
    on any failure.
    """
    try:
        # Timeout so a stalled connection cannot block the caller forever.
        html = requests.get(channel, timeout=30).text
        # Extract all video IDs from the HTML
        video_ids = re.findall(r'(?<="videoId":").*?(?=")', html)
        video_urls = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]

        # Remove duplicates while preserving order (dict keys keep insertion order).
        video_urls = list(dict.fromkeys(video_urls))
        print(f"Fetched {len(video_urls)} video URLs from the channel.")
        return video_urls
    except Exception as e:
        print(f"Error fetching video URLs: {e}")
        return []
Data/yt_transcript.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 2 |
+
from Data.get_video_link import video_links_main
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
# Dynamically get the root directory of the project
|
| 7 |
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent # Moves up from /Data/
|
| 8 |
+
TRANSCRIPTS_FOLDER = PROJECT_ROOT / "Data" / "transcripts"
|
| 9 |
+
|
| 10 |
+
def save_transcript(video_id, transcript_text):
    """
    Write a transcript to a timestamped .txt file and return its path.
    """
    # Make sure the destination folder exists before writing.
    TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    destination = TRANSCRIPTS_FOLDER / f"{video_id}_{stamp}.txt"
    destination.write_text('\n'.join(transcript_text), encoding="utf-8")
    return destination
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_video_id(video_links_list):
    """Strip the watch-URL prefix from each link, leaving bare video IDs."""
    prefix = "https://www.youtube.com/watch?v="
    ids = []
    for link in video_links_list:
        ids.append(link.replace(prefix, ""))
    return ids
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def fetch_yt_transcript(video_ids):
    """
    Fetch YouTube transcripts for the given video IDs.

    Returns a dict mapping video_id -> {'text': [lines], 'file_path': str},
    with {'text': [], 'file_path': None} for videos whose transcript could
    not be retrieved. Successful transcripts are also saved to disk via
    save_transcript().
    """
    video_transcripts = {}

    for video_id in video_ids:
        print(f"Fetching transcript for: {video_id}")
        try:
            output = YouTubeTranscriptApi.get_transcript(video_id)
            transcript_text = [item['text'] for item in output]

            # Save transcript and record where it landed.
            file_path = save_transcript(video_id, transcript_text)
            video_transcripts[video_id] = {
                'text': transcript_text,
                'file_path': str(file_path)
            }
            print(f"Transcript saved to: {file_path}")

        except Exception as e:
            # Bug fix: the original swallowed the captured exception, making
            # failures (disabled captions vs. network errors) undiagnosable.
            print(f"Transcript not available for video {video_id}: {e}")
            video_transcripts[video_id] = {
                'text': [],
                'file_path': None
            }

    return video_transcripts
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def all_video_transcript_pipeline():
    """
    Load existing transcripts from disk and fetch transcripts for new videos.

    Returns a dict mapping video_id -> {'text': [lines], 'file_path': str or
    None}, combining previously saved transcripts with freshly fetched ones.
    """
    print(f"Looking for transcripts in: {TRANSCRIPTS_FOLDER}")
    video_links_list, new_video_added, new_videos_link = video_links_main()
    video_transcripts = {}

    # Always load transcripts already saved on disk.
    if TRANSCRIPTS_FOLDER.exists():
        existing_files = list(TRANSCRIPTS_FOLDER.glob("*.txt"))
        print(f"Found {len(existing_files)} transcript files.")

        for file in existing_files:
            # Filename format: <video_id>_<timestamp>.txt
            video_id = file.stem.split("_")[0]
            try:
                transcript_text = file.read_text(encoding="utf-8").splitlines()
                video_transcripts[video_id] = {
                    'text': transcript_text,
                    'file_path': str(file)
                }
                print(f"Loaded transcript for video: {video_id}")
            except Exception as e:
                print(f"Error loading transcript {file.name}: {e}")
    else:
        print(f"Transcripts folder not found at: {TRANSCRIPTS_FOLDER}, creating it.")
        TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)

    # Fetch transcripts for any newly detected videos and merge them in.
    if new_video_added and new_videos_link:
        print("New videos detected... Fetching transcripts.")
        new_video_ids = [url.split("v=")[1] for url in new_videos_link]
        new_transcripts = fetch_yt_transcript(new_video_ids)
        # Bug fix: the original fetched new transcripts but never merged them
        # into the result, silently discarding the work.
        video_transcripts.update(new_transcripts)

    print(f"Total transcripts loaded: {len(video_transcripts)}")
    # Bug fix: the original returned nothing, leaving callers with None.
    return video_transcripts
| 94 |
+
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Declare build arguments at the top (for initial stage)
|
| 2 |
+
ARG USER_UID=1000
|
| 3 |
+
ARG USER_GID=1000
|
| 4 |
+
|
| 5 |
+
# Stage 1: Build dependencies
|
| 6 |
+
FROM python:3.11-slim AS builder
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
RUN apt-get update && \
|
| 9 |
+
apt-get install -y --no-install-recommends \
|
| 10 |
+
build-essential \
|
| 11 |
+
git && \
|
| 12 |
+
rm -rf /var/lib/apt/lists/*
|
| 13 |
+
RUN python -m venv /opt/venv
|
| 14 |
+
ENV PATH="/opt/venv/bin:$PATH"
|
| 15 |
+
COPY requirements.txt .
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
+
|
| 18 |
+
# Stage 2: Final image
|
| 19 |
+
FROM python:3.11-slim
|
| 20 |
+
|
| 21 |
+
# Re-declare build arguments for this stage
|
| 22 |
+
ARG USER_UID=1000
|
| 23 |
+
ARG USER_GID=1000
|
| 24 |
+
|
| 25 |
+
COPY --from=builder /opt/venv /opt/venv
|
| 26 |
+
ENV PATH="/opt/venv/bin:$PATH"
|
| 27 |
+
WORKDIR /app
|
| 28 |
+
RUN apt-get update && \
|
| 29 |
+
apt-get install -y --no-install-recommends \
|
| 30 |
+
libgomp1 && \
|
| 31 |
+
rm -rf /var/lib/apt/lists/*
|
| 32 |
+
|
| 33 |
+
COPY . .
|
| 34 |
+
|
| 35 |
+
# Create the group and user first
|
| 36 |
+
RUN groupadd -g ${USER_GID} appuser && \
|
| 37 |
+
useradd -m -u ${USER_UID} -g appuser appuser
|
| 38 |
+
|
| 39 |
+
# Create directories and set permissions
|
| 40 |
+
RUN mkdir -p /app/Rag/chromadb.db && \
|
| 41 |
+
mkdir -p /app/Data && \
|
| 42 |
+
chown -R appuser:appuser /app
|
| 43 |
+
|
| 44 |
+
USER appuser
|
| 45 |
+
|
| 46 |
+
# Make sure your Python code uses this path for ChromaDB
|
| 47 |
+
ENV CHROMA_PERSISTENCE_DIRECTORY=/app/Rag/chromadb.db
|
| 48 |
+
|
| 49 |
+
CMD ["python", "-m","ui.app"]
|
Example/__init__.py
ADDED
|
File without changes
|
Example/rag_example.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
import chromadb
from pathlib import Path

# Resolve all paths relative to the repository root so the example can be
# launched from any working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
transcripts_folder_path = PROJECT_ROOT / "Data" / "transcripts"
chromadb_path = PROJECT_ROOT / "Rag" / "chromadb.db"

# Persistent Chroma client backed by the on-disk database.
client = chromadb.PersistentClient(path=str(chromadb_path))
collection = client.get_or_create_collection(name="yt_transcript_collection")

# Make the project packages importable before pulling in the RAG pipeline.
sys.path.append(str(PROJECT_ROOT))
sys.path.append(str(PROJECT_ROOT / "Rag"))
from Rag.rag_pipeline import main_workflow

# Run the application
if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)
Llm/__init__.py
ADDED
|
File without changes
|
Llm/llm_endpoints.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
|
| 2 |
+
import os
|
| 3 |
+
import google.generativeai as genai
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Configure the Generative AI model with the API key from the environment
|
| 7 |
+
load_dotenv()
|
| 8 |
+
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
| 9 |
+
gemini_model = genai.GenerativeModel("models/gemini-1.5-flash")
|
| 10 |
+
|
| 11 |
+
# Function to get a response from the generative model
|
| 12 |
+
def get_llm_response(prompt: str) -> str:
    """Send *prompt* to the configured Gemini model and return its text reply."""
    result = gemini_model.generate_content(prompt)
    return result.text
Prompts/__init__.py
ADDED
|
File without changes
|
Prompts/huberman_prompt.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
huberman_prompt = """
|
| 2 |
+
You are Dr. Andrew Huberman, an expert neuroscientist and educator known for your clear, engaging, and scientifically accurate explanations. When answering, please consider the following:
|
| 3 |
+
1. Provide a clear and concise summary of the scientific concepts involved.
|
| 4 |
+
2. Highlight any relevant research or studies.
|
| 5 |
+
3. Offer actionable insights or practical advice.
|
| 6 |
+
|
| 7 |
+
Context:
|
| 8 |
+
{context}
|
| 9 |
+
|
| 10 |
+
Sources:
|
| 11 |
+
{sources}
|
| 12 |
+
|
| 13 |
+
Conversation History:
|
| 14 |
+
{history}
|
| 15 |
+
|
| 16 |
+
Question:
|
| 17 |
+
{question}
|
| 18 |
+
|
| 19 |
+
Please respond in a manner that is informative, research-backed, and reflective of your unique style.
|
| 20 |
+
"""
|
Prompts/summary_prompt.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
summary_prompts = """
|
| 2 |
+
#System
|
| 3 |
+
You are an AI agents whose job is to summarize the conversation between AI bots and the user
|
| 4 |
+
here is the conversation history
|
| 5 |
+
{{}}
|
| 6 |
+
|
| 7 |
+
#Output format
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
"""
|
README.md
CHANGED
|
@@ -12,3 +12,102 @@ short_description: a bot
|
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 15 |
+
# Andrew Huberman RAG-Based AI Chatbot
|
| 16 |
+
|
| 17 |
+
## Overview
|
| 18 |
+
Xyzbot is an AI chatbot that extracts and synthesizes insights from Andrew Huberman's YouTube videos. It automatically retrieves video transcripts, updates its knowledge base in ChromaDB, and provides citation-linked responses.
|
| 19 |
+
|
| 20 |
+
## 🚀 Key Features
|
| 21 |
+
- Mimics Andrew Huberman's insights using YouTube video transcripts
|
| 22 |
+
- Automatic transcript retrieval and knowledge base updates
|
| 23 |
+
- RAG-powered response generation with direct video citations
|
| 24 |
+
- Interactive Streamlit user interface
|
| 25 |
+
- Docker-based deployment for easy scalability
|
| 26 |
+
|
| 27 |
+
## 🛠 Tech Stack
|
| 28 |
+
- Backend: Python, LangChain, OpenAI API
|
| 29 |
+
- Frontend: Streamlit
|
| 30 |
+
- Database: ChromaDB
|
| 31 |
+
- Deployment: Docker
|
| 32 |
+
|
| 33 |
+
## 📂 Project Structure
|
| 34 |
+
```
|
| 35 |
+
📦 Xyzbot
|
| 36 |
+
├── 📂 Data
|
| 37 |
+
├── 📂 Example
|
| 38 |
+
├── 📂 Llm
|
| 39 |
+
├── 📂 Notebook
|
| 40 |
+
├── 📂 Prompts
|
| 41 |
+
├── 📂 Rag
|
| 42 |
+
│ ├── chromadb.db
|
| 43 |
+
│ └── 📂 Processed_folder
|
| 44 |
+
├── 📂 utils
|
| 45 |
+
├── Dockerfile
|
| 46 |
+
└── pyproject.toml
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## 🔧 Prerequisites
|
| 50 |
+
- Python 3.8+
|
| 51 |
+
- Docker (optional)
|
| 52 |
+
|
| 53 |
+
## 🔑 API Keys Required
|
| 54 |
+
1. Google Gemini API Key
|
| 55 |
+
2. YouTube API Key
|
| 56 |
+
|
| 57 |
+
## 🚀 Installation
|
| 58 |
+
|
| 59 |
+
### Local Setup
|
| 60 |
+
1. Clone the repository
|
| 61 |
+
```bash
|
| 62 |
+
git clone https://github.com/Angel-dash/Xyzbot.git
|
| 63 |
+
cd Xyzbot
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
2. Create virtual environment
|
| 67 |
+
```bash
|
| 68 |
+
python3 -m venv venv
|
| 69 |
+
source venv/bin/activate
|
| 70 |
+
pip install -r requirements.txt
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### Docker Setup
|
| 74 |
+
|
| 75 |
+
#### Option 1: Build Locally
|
| 76 |
+
```bash
|
| 77 |
+
docker build -t xyzbot:v1.0 .
|
| 78 |
+
docker run -it \
|
| 79 |
+
-v $(pwd)/Rag:/app/Rag:rw \
|
| 80 |
+
-e GOOGLE_API_KEY=your_api_key \
|
| 81 |
+
xyzbot:v1.0
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
#### Option 2: Pull from Docker Hub
|
| 85 |
+
```bash
|
| 86 |
+
docker pull angeldash/xyzbot:v1.0
|
| 87 |
+
docker run -it \
|
| 88 |
+
-v $(pwd)/Rag:/app/Rag:rw \
|
| 89 |
+
-e GOOGLE_API_KEY=your_api_key \
|
| 90 |
+
angeldash/xyzbot:v1.0
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
## 🖥️ Running the Application
|
| 94 |
+
```bash
|
| 95 |
+
streamlit run src/main.py
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
## 📈 Future Roadmap
|
| 99 |
+
- Fine-tuned LLM response generation
|
| 100 |
+
- Real-time multi-channel monitoring
|
| 101 |
+
- Enhanced citation formatting
|
| 102 |
+
- AI agent conversation handling
|
| 103 |
+
- Performance optimization
|
| 104 |
+
|
| 105 |
+
## 📜 License
|
| 106 |
+
MIT License
|
| 107 |
+
|
| 108 |
+
## 🤝 Contributing
|
| 109 |
+
Contributions are welcome! Open an issue or submit a pull request.
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
**Author:** Angel Dash | **GitHub:** [@Angel-dash](https://github.com/Angel-dash)
|
| 113 |
+
|
Rag/Processed_folder/processed_files.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", 
"RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", "RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", 
"TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", 
"XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", "nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", 
"T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
|
Rag/__init__.py
ADDED
|
File without changes
|
Rag/rag_pipeline.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import chromadb
|
| 2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
import google.generativeai as genai
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 8 |
+
from Llm.llm_endpoints import get_llm_response
|
| 9 |
+
from utils.get_link import get_source_link
|
| 10 |
+
from Prompts.huberman_prompt import huberman_prompt
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
# Configuration
|
| 13 |
+
API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 14 |
+
if API_KEY:
|
| 15 |
+
genai.configure(api_key=API_KEY)
|
| 16 |
+
|
| 17 |
+
chromadb_path = "app/Rag/chromadb.db"
|
| 18 |
+
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 19 |
+
|
| 20 |
+
# Logging
|
| 21 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Helper Functions
|
| 25 |
+
def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """Break a transcript string into overlapping character chunks.

    Args:
        docs: Raw transcript text to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        list[str]: The chunked pieces of text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(docs)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_new_files(transcripts_folder_path, collection):
    """Return transcript filenames on disk not yet stored in the collection.

    Compares the folder's ``.txt`` files against the ``"source"`` values
    already recorded in the collection's metadata.
    """
    processed = {meta["source"] for meta in collection.get()["metadatas"]}
    return [
        name
        for name in os.listdir(transcripts_folder_path)
        if name.endswith(".txt") and name not in processed
    ]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def process_single_file(file_path):
    """Read one transcript file and split it into chunks.

    Returns:
        tuple[list[str], str]: The text chunks and the file's basename.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    return split_text_to_chunks(raw_text), os.path.basename(file_path)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def batch_embed_chunks(chunks, batch_size=32):
    """Encode text chunks with the sentence-transformer model, batch by batch.

    Returns:
        list[list[float]]: One embedding vector per input chunk.
    """
    vectors = []
    for start in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
        batch = chunks[start:start + batch_size]
        encoded = embedding_model.encode(batch, show_progress_bar=True)
        vectors.extend(encoded.tolist())
    return vectors
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def process_and_add_new_files(transcripts_folder_path, collection):
    """Process and add new transcript files to the vector database.

    Finds transcripts not yet present in ``collection``, chunks them in
    parallel, embeds all chunks, and upserts documents/embeddings/metadata
    in batches.

    Args:
        transcripts_folder_path: Folder containing ``.txt`` transcripts.
        collection: Chroma collection to upsert into.

    Returns:
        bool: True if any new files were ingested, False otherwise.
    """
    new_files = get_new_files(transcripts_folder_path, collection)
    if not new_files:
        logging.info("No new files to process")
        return False

    # Use a reasonable number of workers (4 is usually a good default)
    n_workers = min(4, len(new_files))
    logging.info(f"Using {n_workers} workers for processing")

    all_chunks = []
    all_metadata = []
    all_ids = []

    # Process files in parallel
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = {
            executor.submit(process_single_file, os.path.join(transcripts_folder_path, file)): file
            for file in new_files
        }

        for future in as_completed(futures):
            file = futures[future]
            try:
                chunks, filename = future.result()
                file_metadata = [{"source": filename} for _ in range(len(chunks))]
                # BUG FIX: ids must embed the filename — a constant
                # "chunk_<i>" prefix made ids collide across files, so each
                # file's upsert overwrote the previous file's chunks.
                file_ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]

                all_chunks.extend(chunks)
                all_metadata.extend(file_metadata)
                all_ids.extend(file_ids)

                logging.info(f"Processed {filename}")
            except Exception as e:
                logging.error(f"Error processing {file}: {str(e)}")
                continue

    if not all_chunks:
        # Every file failed to process — nothing to embed or store.
        logging.info("No chunks produced from new files")
        return False

    # Process embeddings in batches
    logging.info(f"Generating embeddings for {len(all_chunks)} chunks")
    embeddings = batch_embed_chunks(all_chunks)

    # Add to database in batches
    batch_size = 500
    for i in range(0, len(all_chunks), batch_size):
        end_idx = min(i + batch_size, len(all_chunks))
        collection.upsert(
            documents=all_chunks[i:end_idx],
            embeddings=embeddings[i:end_idx],
            metadatas=all_metadata[i:end_idx],
            ids=all_ids[i:end_idx]
        )
        logging.info(f"Added batch {i // batch_size + 1} to database")

    logging.info(f"Successfully processed {len(new_files)} files")
    return True
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def query_database(collection, query_text, n_results=3):
    """Retrieve the most relevant chunks for the query.

    Returns:
        tuple[list, list]: Retrieved documents and their metadata dicts for
        the single query.
    """
    query_vector = embedding_model.encode(query_text).tolist()
    hits = collection.query(query_embeddings=query_vector, n_results=n_results)
    return hits['documents'][0], hits['metadatas'][0]
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def enhance_query_with_history(query_text, summarized_history):
    """Combine the current query with conversation history for retrieval.

    Args:
        query_text: The user's current question.
        summarized_history: Prior-conversation context to append.

    Returns:
        str: Query and history joined by a blank line.
    """
    # BUG FIX: the original f-string embedded a literal "*2" after the query
    # ("{query_text}*2"), polluting the retrieval text. It looked like a
    # leftover attempt at query weighting and is removed.
    return f"{query_text}\n\n{summarized_history}"
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def update_conversation_history(history, user_query, bot_response):
    """Append one user/bot exchange to the running conversation history.

    Mutates ``history`` in place and also returns it for convenience.
    """
    turn = {"user": user_query, "bot": bot_response}
    history.append(turn)
    return history
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def generate_response(conversation_history, query_text, retrieved_docs, source_links):
    """Generate a response using retrieved documents and the generative AI model."""
    context = " ".join(retrieved_docs)
    transcript = "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
    )
    sources_str = "\n".join(source_links)

    prompt = huberman_prompt.format(
        context=context,
        sources=sources_str,
        history=transcript,
        question=query_text,
    )

    answer = get_llm_response(prompt)
    return f"{answer}\n\nSources:\n{sources_str}"
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def main_workflow(transcripts_folder_path, collection):
    """Run the full RAG workflow as an interactive terminal loop.

    Ingests any new transcripts, then answers user queries until "exit".
    """
    new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
    if new_files_added:
        logging.info("New transcripts added to the database.")
    else:
        logging.info("No new files found. Using existing database.")

    conversation_history = []

    while True:
        query_text = input("\nEnter your query(or type 'exit' to end):").strip()
        if query_text.lower() == "exit":
            print("Ending the conversation. Goodbye")
            break

        # NOTE(review): enhance_query_with_history is written to take a
        # *summarized* history string but receives the raw history list here,
        # so the list's repr is interpolated into the retrieval query.
        # Preserved as-is — confirm whether summarize_conversation should be
        # called first.
        query_with_history = enhance_query_with_history(query_text, conversation_history)
        retrieved_docs, metadatas = query_database(collection, query_with_history)
        print("-" * 50)
        source_link = get_source_link(metadatas)
        print(source_link)
        print("-" * 50)

        if not retrieved_docs:
            # Message fixed from "No relevent documents is found".
            print("No relevant documents found")
            continue

        response = generate_response(conversation_history, query_text, retrieved_docs, source_link)
        conversation_history = update_conversation_history(conversation_history, query_text, response)
        print("\nGenerated Response:")
        print(response)
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "xyzbot"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "A rag application"
|
| 5 |
+
authors = [
|
| 6 |
+
{name = "Angel", email = "njlghmr@gmail.com"}
|
| 7 |
+
]
|
| 8 |
+
license = {text = "MIT"}
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python =">=3.11,<3.12"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"pyarrow (>=19.0.0,<20.0.0)",
|
| 13 |
+
"pandas (>=2.2.3,<3.0.0)",
|
| 14 |
+
"pendulum (>=3.0.0,<4.0.0)",
|
| 15 |
+
"google-generativeai (>=0.8.4,<0.9.0)",
|
| 16 |
+
"langchain (>=0.3.16,<0.4.0)",
|
| 17 |
+
"langchain-openai (>=0.3.3,<0.4.0)",
|
| 18 |
+
"langchain-chroma (>=0.2.1,<0.3.0)",
|
| 19 |
+
"langchain-community (>=0.3.16,<0.4.0)",
|
| 20 |
+
"chromadb (>=0.4.14)",
|
| 21 |
+
"pypdf (==4.2.0)",
|
| 22 |
+
"flask (==3.0.1)",
|
| 23 |
+
"flask-cors (==3.0.10)",
|
| 24 |
+
"sentence-transformers (==3.3.1)",
|
| 25 |
+
"tqdm (==4.67.1)",
|
| 26 |
+
"torch (==2.5.1)",
|
| 27 |
+
"transformers (==4.46.3)",
|
| 28 |
+
"pydantic (>=2.7.4,<3.0.0)"
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
[build-system]
|
| 33 |
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
| 34 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.in
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pyarrow
|
| 2 |
+
pandas[performance, parquet, aws]
|
| 3 |
+
pendulum
|
| 4 |
+
google.generativeai
|
| 5 |
+
langchain
|
| 6 |
+
langchain_openai
|
| 7 |
+
langchain_chroma
|
| 8 |
+
langchain_community
|
| 9 |
+
chromadb==0.4.8
|
| 10 |
+
pypdf
|
| 11 |
+
flask
|
| 12 |
+
flask_cors
|
| 13 |
+
sentence_transformers
|
| 14 |
+
tqdm
|
| 15 |
+
torch
|
| 16 |
+
transformers
|
| 17 |
+
spacy==3.5.0
|
| 18 |
+
coreferee==1.4.1
|
requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
langchain>=0.3.16,<0.4.0
|
| 3 |
+
langchain_openai
|
| 4 |
+
langchain_chroma
|
| 5 |
+
langchain-community>=0.3.16,<0.4.0
|
| 6 |
+
chromadb>=0.4.14
|
| 7 |
+
flask==3.0.1
|
| 8 |
+
flask_cors==3.0.10
|
| 9 |
+
google.generativeai
|
| 10 |
+
pydantic>=2.7.4,<3.0.0
|
| 11 |
+
streamlit
|
| 12 |
+
# PDF Processing
|
| 13 |
+
pypdf==4.2.0
|
| 14 |
+
|
| 15 |
+
# ML/AI Dependencies (with CPU-only versions)
|
| 16 |
+
sentence_transformers==2.3.1
|
| 17 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 18 |
+
torch==2.1.0+cpu
|
| 19 |
+
|
| 20 |
+
gradio
|
setup.sh
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Install Python dependencies
|
| 2 |
+
pip install -r requirements.txt
|
| 3 |
+
|
| 4 |
+
# Download spaCy model
|
| 5 |
+
python -m spacy download en_core_web_sm
|
| 6 |
+
|
| 7 |
+
# Install Coreferee for English
|
| 8 |
+
python -m coreferee install en
|
| 9 |
+
|
| 10 |
+
echo "Setup completed successfully!"
|
ui/__init__.py
ADDED
|
File without changes
|
ui/app.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import chromadb
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
project_root = Path(__file__).resolve().parent.parent
|
| 8 |
+
sys.path.append(str(project_root))
|
| 9 |
+
sys.path.append(str(project_root / "Rag"))
|
| 10 |
+
sys.path.append(str(project_root / "Data"))
|
| 11 |
+
sys.path.append(str(project_root / "Data" / "transcripts"))
|
| 12 |
+
sys.path.append(str(project_root / "Data" / "video_links"))
|
| 13 |
+
sys.path.append(str(project_root / "Llm"))
|
| 14 |
+
sys.path.append(str(project_root / "Prompts"))
|
| 15 |
+
sys.path.append(str(project_root / "utils"))
|
| 16 |
+
from Rag.rag_pipeline import (
|
| 17 |
+
query_database,
|
| 18 |
+
generate_response,
|
| 19 |
+
enhance_query_with_history,
|
| 20 |
+
update_conversation_history,
|
| 21 |
+
process_and_add_new_files
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
INTRODUCTION = """
|
| 25 |
+
# 🧠 Welcome to HubermanBot!
|
| 26 |
+
|
| 27 |
+
I am your AI assistant trained on Andrew Huberman's podcast content. My knowledge base includes detailed information about:
|
| 28 |
+
|
| 29 |
+
- 🎯 Peak Performance & Focus
|
| 30 |
+
- 😴 Sleep Science & Optimization
|
| 31 |
+
- 🏋️ Physical Fitness & Recovery
|
| 32 |
+
- 🧘 Mental Health & Stress Management
|
| 33 |
+
- 🧪 Neuroscience & Biology
|
| 34 |
+
- 💪 Habit Formation & Behavior Change
|
| 35 |
+
|
| 36 |
+
For each response, I'll provide:
|
| 37 |
+
- Detailed answers based on podcast content
|
| 38 |
+
- Direct source links to specific episodes
|
| 39 |
+
- Scientific context when available
|
| 40 |
+
|
| 41 |
+
Ask me anything about these topics, and I'll help you find relevant information from the Huberman Lab Podcast!
|
| 42 |
+
|
| 43 |
+
Example questions you might ask:
|
| 44 |
+
- "What does Dr. Huberman recommend for better sleep?"
|
| 45 |
+
- "How can I improve my focus and concentration?"
|
| 46 |
+
- "What are the best practices for morning routines?"
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def format_youtube_url(filename: str) -> str:
    """Convert a transcript filename to its YouTube watch URL.

    Filenames look like ``<video_id>_<YYYYmmddHHMMSS>.txt``. YouTube video
    ids may themselves contain underscores (e.g. "q-H_A_dQUxQ" or
    "__RAXBLt1iM" in the processed-files list), so the id is everything
    before the *last* underscore — splitting on the first underscore
    truncated those ids.
    """
    video_id = filename.removesuffix('.txt').rsplit('_', 1)[0]
    return f"https://www.youtube.com/watch?v={video_id}"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class RAGChatInterface:
    """Gradio-facing chat wrapper around the RAG pipeline."""

    def __init__(self, transcripts_folder_path: str, collection):
        # Folder of transcripts and the Chroma collection backing retrieval.
        self.transcripts_folder_path = transcripts_folder_path
        self.collection = collection
        self.conversation_history: List[Dict[str, str]] = []

    def process_query(self, message: str, history: List[List[str]]) -> str:
        """Answer one chat message, appending source-episode links."""
        # Mirror Gradio's [user, bot] pair format into our dict format.
        self.conversation_history = [
            {"user": user_msg, "bot": bot_msg} for user_msg, bot_msg in history
        ]

        # Fold prior turns into the retrieval query.
        contextual_query = enhance_query_with_history(message, self.conversation_history)

        retrieved_docs, metadatas = query_database(self.collection, contextual_query)

        if not retrieved_docs:
            return (
                "I apologize, but I couldn't find any relevant information "
                "about that in my knowledge base. Could you try rephrasing "
                "your question or ask about a different topic covered in "
                "the Huberman Lab Podcast?"
            )

        # Generate the answer from the retrieved context.
        source_files = [meta["source"] for meta in metadatas]
        answer = generate_response(
            self.conversation_history,
            message,
            retrieved_docs,
            source_files,
        )

        # De-duplicate sources and turn filenames into watchable links.
        episode_urls = [format_youtube_url(name) for name in list(set(source_files))]

        # Markdown footer listing the source episodes.
        parts = [f"{answer}\n\n---\n📚 **Source Episodes:**\n"]
        parts.extend(f"- {url}\n" for url in episode_urls)
        return "".join(parts)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def create_interface(transcripts_folder_path: str, collection) -> gr.Interface:
    """Create and configure the Gradio chat interface."""
    # Chat handler bound to this transcripts folder and collection.
    rag_chat = RAGChatInterface(transcripts_folder_path, collection)

    example_questions = [
        "What are Dr. Huberman's top recommendations for better sleep?",
        "How does sunlight exposure affect our circadian rhythm?",
        "What supplements does Dr. Huberman recommend for focus?",
        "What are the best practices for morning routines according to Dr. Huberman?",
        "How can I optimize my workout recovery based on neuroscience?",
    ]

    return gr.ChatInterface(
        fn=rag_chat.process_query,
        title="🧠 HubermanBot - Your Neuroscience & Wellness AI Assistant",
        description=INTRODUCTION,
        examples=example_questions,
        theme=gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="blue",
        ),
    )
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def main():
    """Wire up ChromaDB, ingest new transcripts, and launch the Gradio UI."""
    # Resolve all paths from the project root so the app works from any CWD.
    root = Path(__file__).parent.parent
    db_path = root / "Rag" / "chromadb.db"

    client = chromadb.PersistentClient(path=str(db_path))
    collection = client.get_or_create_collection(name="yt_transcript_collection")

    transcripts_dir = root / "Data" / "transcripts"

    # Ingest any transcripts added since the last run.
    process_and_add_new_files(str(transcripts_dir), collection)

    # Build and serve the chat UI.
    interface = create_interface(str(transcripts_dir), collection)
    interface.launch(share=True, server_port=7860)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
if __name__ == "__main__":
|
| 147 |
+
main()
|
utils/__init__.py
ADDED
|
File without changes
|
utils/corefrence.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spacy
|
| 2 |
+
from spacy.tokens import Doc
|
| 3 |
+
import coreferee
|
| 4 |
+
|
| 5 |
+
# Load spaCy model
|
| 6 |
+
nlp = spacy.load('en_core_web_sm')
|
| 7 |
+
nlp.add_pipe("coreferee")
|
| 8 |
+
|
| 9 |
+
# Register the custom extension attribute
|
| 10 |
+
Doc.set_extension('resolved_text', default=None, force=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def resolve_coreferences(query_text, conversation_history):
    """
    Resolve coreferences in the given text using spaCy and coreferee.

    Args:
        query_text (str): The current query to resolve
        conversation_history (list): List of dictionaries containing previous
            conversation turns with "user" and "bot" keys

    Returns:
        str: The current query with coreferences resolved against the
        conversation transcript
    """
    # Combine conversation history and current query into one transcript so
    # pronouns in the query can refer back to earlier turns.
    combined_text = []
    for turn in conversation_history:
        combined_text.append(f"User: {turn['user']}")
        # BUG FIX: history turns store the bot reply under the lowercase
        # "bot" key (see update_conversation_history); turn['Bot'] raised
        # KeyError on the first non-empty history.
        combined_text.append(f"Bot: {turn['bot']}")
    combined_text.append(f"User: {query_text}")
    text = "\n".join(combined_text)

    # Process the text
    doc = nlp(text)

    # Get all tokens and their potential antecedents
    resolved_tokens = list(doc)

    # Replace each non-head mention with its most specific antecedent token.
    for chain in doc._.coref_chains:
        for mention in chain:
            if mention.root_index != chain.most_specific.root_index:
                resolved_tokens[mention.root_index] = doc[chain.most_specific.root_index]

    # Reconstruct the text with resolved references
    resolved_text = "".join([token.text_with_ws if isinstance(token, spacy.tokens.Token)
                             else token.text + " " for token in resolved_tokens])

    # Extract the resolved query (last line)
    resolved_query = resolved_text.split('\n')[-1].replace("User: ", "").strip()

    return resolved_query
|
utils/get_link.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def get_source_link(metadatas):
|
| 2 |
+
link = 'https://www.youtube.com/watch?v='
|
| 3 |
+
yt_link = []
|
| 4 |
+
for metadata in metadatas:
|
| 5 |
+
source = metadata['source']
|
| 6 |
+
values = source.split('.txt')
|
| 7 |
+
|
| 8 |
+
link = link + values[0]
|
| 9 |
+
yt_link.append(link)
|
| 10 |
+
# print(yt_link)
|
| 11 |
+
return yt_link
|
utils/summarization.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Llm.llm_endpoints import get_llm_response
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def summarize_conversation(conversation_history):
    """Summarize the conversation so far via the LLM.

    Args:
        conversation_history: List of {"user": ..., "bot": ...} turn dicts.

    Returns:
        str: The LLM-produced summary, or "" if summarization fails.
    """
    try:
        transcript = "\n".join(
            f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
        )
        summary_prompt = "Summarize the following conversation:\n" + transcript
        summary = get_llm_response(summary_prompt)
        print("*************************************************")
        print(summary)
        print("*************************************************")
        return summary
    except Exception as exc:
        # Narrowed from a bare `except:` (which also swallowed SystemExit and
        # KeyboardInterrupt). Still best-effort — an empty summary degrades
        # gracefully — but the failure is now reported instead of hidden.
        print(f"Summarization failed: {exc}")
        return ""
|