Spaces:
Sleeping
Sleeping
Update functions.py
Browse files- functions.py +113 -2
functions.py
CHANGED
|
@@ -88,8 +88,119 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
|
|
| 88 |
|
| 89 |
else:
|
| 90 |
st.write("Vector store doesnt exist and will be created now")
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
|
|
|
| 88 |
|
| 89 |
else:
|
| 90 |
st.write("Vector store doesnt exist and will be created now")
|
| 91 |
+
urls = [
|
| 92 |
+
|
| 93 |
+
"https://github.com/zedr/clean-code-python",
|
| 94 |
+
"https://tenthousandmeters.com/blog/python-behind-the-scenes-10-how-python-dictionaries-work/",
|
| 95 |
+
"https://realpython.com/python-testing/",
|
| 96 |
+
"https://docs.python-guide.org/writing/license/",
|
| 97 |
+
"https://blogs.nvidia.com/blog/what-is-a-transformer-model/",
|
| 98 |
+
"https://research.google/blog/transformer-a-novel-neural-network-architecture-for-language-understanding/",
|
| 99 |
+
"https://realpython.com/python-pep8/",
|
| 100 |
+
"https://towardsdatascience.com/ideal-python-environment-setup-for-data-science-cdb03a447de8",
|
| 101 |
+
"https://realpython.com/python3-object-oriented-programming/",
|
| 102 |
+
"https://realpython.com/python-functional-programming/",
|
| 103 |
+
"https://fivethirtyeight.com/features/science-isnt-broken/",
|
| 104 |
+
"https://github.com/renatofillinich/ab_test_guide_in_python/blob/master/AB%20testing%20with%20Python.ipynb",
|
| 105 |
+
"https://towardsdatascience.com/why-is-data-science-failing-to-solve-the-right-problems-7b5b6121e3b4",
|
| 106 |
+
"https://medium.com/@srowen/common-probability-distributions-347e6b945ce4",
|
| 107 |
+
"https://github.com/renatofillinich/ab_test_guide_in_python/blob/master/AB%20testing%20with%20Python.ipynb",
|
| 108 |
+
"https://scikit-learn.org/stable/modules/compose.html",
|
| 109 |
+
"https://machinelearningmastery.com/light-gradient-boosted-machine-lightgbm-ensemble/",
|
| 110 |
+
"https://neptune.ai/blog/xgboost-vs-lightgbm",
|
| 111 |
+
"https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27",
|
| 112 |
+
"https://www.cio.com/article/247005/what-are-containers-and-why-do-you-need-them.html",
|
| 113 |
+
"https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained",
|
| 114 |
+
"https://towardsdatascience.com/making-friends-with-machine-learning-5e28d5205a29",
|
| 115 |
+
"https://towardsdatascience.com/handling-imbalanced-datasets-in-machine-learning-7a0e84220f28",
|
| 116 |
+
"https://machinelearningmastery.com/multi-class-imbalanced-classification/",
|
| 117 |
+
"https://imbalanced-learn.org/stable/auto_examples/applications/plot_impact_imbalanced_classes.html",
|
| 118 |
+
"https://docs.ray.io/en/master/tune/examples/tune-sklearn.html",
|
| 119 |
+
"https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy",
|
| 120 |
+
"https://cs231n.github.io/optimization-2/",
|
| 121 |
+
"https://alexander-schiendorfer.github.io/2020/02/24/a-worked-example-of-backprop.html",
|
| 122 |
+
"https://www.analyticsvidhya.com/blog/2020/01/fundamentals-deep-learning-activation-functions-when-to-use-them/",
|
| 123 |
+
"https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html",
|
| 124 |
+
"https://d2l.ai/chapter_multilayer-perceptrons/mlp.html",
|
| 125 |
+
"https://d2l.ai/chapter_linear-classification/softmax-regression.html#loss-function",
|
| 126 |
+
"https://d2l.ai/chapter_optimization/",
|
| 127 |
+
"https://www.investopedia.com/terms/s/statistical-significance.asp",
|
| 128 |
+
"https://d2l.ai/chapter_linear-classification/softmax-regression.html#loss-function",
|
| 129 |
+
"https://d2l.ai/chapter_convolutional-neural-networks/why-conv.html",
|
| 130 |
+
"https://d2l.ai/chapter_convolutional-modern/alexnet.html",
|
| 131 |
+
"https://d2l.ai/chapter_convolutional-modern/vgg.html",
|
| 132 |
+
"https://d2l.ai/chapter_convolutional-modern/nin.html",
|
| 133 |
+
"https://d2l.ai/chapter_convolutional-modern/googlenet.html",
|
| 134 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/',
|
| 135 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/string/',
|
| 136 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/comparison/',
|
| 137 |
+
'https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/trajectory/',
|
| 138 |
+
"https://langchain-ai.github.io/langgraph/concepts/high_level/#why-langgraph",
|
| 139 |
+
'https://langchain-ai.github.io/langgraph/concepts/low_level/#only-stream-tokens-from-specific-nodesllms',
|
| 140 |
+
"https://langchain-ai.github.io/langgraph/concepts/agentic_concepts/#reflection",
|
| 141 |
+
"https://langchain-ai.github.io/langgraph/concepts/faq/",
|
| 142 |
+
"https://www.geeksforgeeks.org/python-oops-concepts/",
|
| 143 |
+
"https://www.mckinsey.com/featured-insights/mckinsey-explainers/what-is-fintech",
|
| 144 |
+
"https://datascientest.com/en/adversarial-attack-definition-and-protection-against-this-threat",
|
| 145 |
+
"https://datascientest.com/en/all-about-dspy",
|
| 146 |
+
"https://datascientest.com/en/arithmetic-and-data-science",
|
| 147 |
+
"https://datascientest.com/en/all-about-machine-learning-metrics",
|
| 148 |
+
"https://datascientest.com/en/all-about-procedural-programming",
|
| 149 |
+
"https://datascientest.com/en/all-about-cryptography",
|
| 150 |
+
"https://datascientest.com/en/all-about-predictive-coding",
|
| 151 |
+
"https://datascientest.com/en/all-about-network-convergence",
|
| 152 |
+
"https://datascientest.com/en/all-about-forensic-analysis",
|
| 153 |
+
"https://datascientest.com/en/all-about-chatgpt-jailbreak",
|
| 154 |
+
"https://datascientest.com/en/all-about-pentest",
|
| 155 |
+
"https://datascientest.com/en/all-about-embedded-systems",
|
| 156 |
+
"https://datascientest.com/en/all-about-network-operating-system",
|
| 157 |
+
"https://datascientest.com/en/all-about-ai-and-cybersecurity",
|
| 158 |
+
"https://datascientest.com/en/all-about-cybernetics",
|
| 159 |
+
"https://datascientest.com/en/all-about-seo",
|
| 160 |
+
"https://datascientest.com/en/all-about-expert-system",
|
| 161 |
+
"https://datascientest.com/en/all-about-telecommunications",
|
| 162 |
+
"https://datascientest.com/en/all-about-smart-cities",
|
| 163 |
+
"https://datascientest.com/en/all-about-artificial-intelligence-and-finance-sector",
|
| 164 |
+
"https://datascientest.com/en/all-about-generated-pre-trained-transformers",
|
| 165 |
+
"https://datascientest.com/en/all-about-iso-27001",
|
| 166 |
+
"https://datascientest.com/en/all-about-smart-sensors",
|
| 167 |
+
"https://datascientest.com/en/all-about-virtual-networks",
|
| 168 |
+
"https://datascientest.com/en/all-about-ethical-ai",
|
| 169 |
+
"https://datascientest.com/en/all-about-saio",
|
| 170 |
+
"https://datascientest.com/en/all-about-recommendation-algorithm",
|
| 171 |
+
"https://www.geeksforgeeks.org/activation-functions-neural-networks/",
|
| 172 |
+
"https://www.geeksforgeeks.org/activation-functions-in-neural-networks-set2/?ref=oin_asr1",
|
| 173 |
+
"https://www.geeksforgeeks.org/choosing-the-right-activation-function-for-your-neural-network/?ref=oin_asr3",
|
| 174 |
+
"https://www.geeksforgeeks.org/difference-between-feed-forward-neural-networks-and-recurrent-neural-networks/?ref=oin_asr2",
|
| 175 |
+
"https://www.geeksforgeeks.org/recurrent-neural-networks-explanation/?ref=oin_asr11",
|
| 176 |
+
"https://www.geeksforgeeks.org/deeppose-human-pose-estimation-via-deep-neural-networks/?ref=oin_asr13",
|
| 177 |
+
"https://www.geeksforgeeks.org/auto-associative-neural-networks/?ref=oin_asr18",
|
| 178 |
+
"https://www.geeksforgeeks.org/what-are-graph-neural-networks/?ref=oin_asr30",
|
| 179 |
+
"https://hdsr.mitpress.mit.edu/pub/la3vitqm/release/2",
|
| 180 |
+
"https://datasciencedojo.com/blog/a-guide-to-large-language-models/",
|
| 181 |
+
"https://datasciencedojo.com/blog/bootstrap-sampling/",
|
| 182 |
+
"https://datasciencedojo.com/blog/top-statistical-concepts/",
|
| 183 |
+
"https://datasciencedojo.com/blog/probability-for-data-science/",
|
| 184 |
+
"https://datasciencedojo.com/blog/top-statistical-techniques/",
|
| 185 |
+
"https://datasciencedojo.com/blog/statistical-distributions/",
|
| 186 |
+
"https://datasciencedojo.com/blog/data-science-in-finance/",
|
| 187 |
+
"https://datasciencedojo.com/blog/random-forest-algorithm/",
|
| 188 |
+
"https://datasciencedojo.com/blog/gini-index-and-entropy/",
|
| 189 |
+
"https://datasciencedojo.com/blog/boosting-algorithms-in-machine-learning/",
|
| 190 |
+
"https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
|
| 191 |
+
"https://datasciencedojo.com/blog/langgraph-tutorial/",
|
| 192 |
+
"https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
|
| 193 |
+
"https://datasciencedojo.com/blog/on-device-ai/",
|
| 194 |
+
|
| 195 |
+
def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
    """Load the web pages behind ``links`` and return them as one flat list.

    Each distinct link is fetched with its own ``NewsURLLoader`` and the
    loaded documents are concatenated in first-seen link order. A link that
    appears more than once is fetched only once, so a repeated URL does not
    put duplicate documents into the result.

    Args:
        links: Iterable of URL strings to load.
        chunk_size: Not used by this function; kept so existing callers
            that pass it keep working.
        chunk_overlap: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A list holding everything returned by the loaders' ``load()`` calls.
        Empty when ``links`` is empty.
    """
    data = []
    # dict.fromkeys drops repeated URLs while preserving their order.
    for link in dict.fromkeys(links):
        loader = NewsURLLoader(urls=[link])
        data.extend(loader.load())
    return data
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
docs = extract_sentences_from_web(links=urls)
|
| 204 |
|
| 205 |
|
| 206 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|