Spaces:
Running
Running
project upload
Browse files- .gitattributes +4 -34
- .gitignore +4 -0
- Home.py +162 -0
- README.md +9 -7
- requirements.txt +57 -0
- src/FAISS/FAISS.ipynb +390 -0
- src/chatbot.py +286 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,5 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.
|
| 23 |
-
*.
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.faiss filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
src/FAISS/speeches_1949_09_12.faiss filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
src/FAISS/speeches_1949_09_12.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
hf_upload.py
|
| 3 |
+
.env
|
| 4 |
+
.mypy_cache
|
Home.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import snapshot_download
|
| 2 |
+
# Download legislature vectordatabase
|
| 3 |
+
REPO_ID = "TomData/speeches-of-the-german-parliament"
|
| 4 |
+
LOCAL_DIR = "src/FAISS"
|
| 5 |
+
snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_DIR, repo_type="dataset")
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
#from gradio_calendar import Calendar
|
| 9 |
+
#from datetime import datetime
|
| 10 |
+
from src.chatbot import chatbot, keyword_search
|
| 11 |
+
|
| 12 |
+
# Only required when running locally
|
| 13 |
+
# import os
|
| 14 |
+
# from dotenv import load_dotenv
|
| 15 |
+
# from huggingface_hub import login
|
| 16 |
+
# load_dotenv(dotenv_path=".env")
|
| 17 |
+
# login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Define important variables
|
| 21 |
+
legislature_periods = [
|
| 22 |
+
"All",
|
| 23 |
+
"20. Legislaturperiode",
|
| 24 |
+
"19. Legislaturperiode",
|
| 25 |
+
"18. Legislaturperiode",
|
| 26 |
+
"17. Legislaturperiode",
|
| 27 |
+
"16. Legislaturperiode",
|
| 28 |
+
"15. Legislaturperiode",
|
| 29 |
+
"14. Legislaturperiode",
|
| 30 |
+
"13. Legislaturperiode",
|
| 31 |
+
"12. Legislaturperiode",
|
| 32 |
+
"11. Legislaturperiode",
|
| 33 |
+
"10. Legislaturperiode",
|
| 34 |
+
"9. Legislaturperiode",
|
| 35 |
+
"8. Legislaturperiode",
|
| 36 |
+
"7. Legislaturperiode",
|
| 37 |
+
"6. Legislaturperiode",
|
| 38 |
+
"5. Legislaturperiode",
|
| 39 |
+
"4. Legislaturperiode",
|
| 40 |
+
"3. Legislaturperiode",
|
| 41 |
+
"2. Legislaturperiode",
|
| 42 |
+
"1. Legislaturperiode"
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD', 'DA', 'FVP','DP','Z', 'PDS','Fraktionslos','not found', 'Gast']
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Define Gradio App Layout
|
| 49 |
+
with gr.Blocks() as App:
|
| 50 |
+
with gr.Tab("ChatBot"):
|
| 51 |
+
with gr.Blocks(fill_height=True):
|
| 52 |
+
with gr.Accordion(open=False, label="Filter"):
|
| 53 |
+
# Apply RAG using chatbot function from file chatbot.py
|
| 54 |
+
db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="Legislature", info="Select a combination of legislatures as basis for the chatbot's replies", show_label=True)
|
| 55 |
+
prompt_language = gr.Dropdown(choices=["DE", "EN"], value="DE",label="Language", info="Choose output language", multiselect=False)
|
| 56 |
+
|
| 57 |
+
gr.ChatInterface(chatbot,
|
| 58 |
+
title="PoliticsToYou",
|
| 59 |
+
description= "Ask anything about your favorite political topic from any legislature period",
|
| 60 |
+
examples=[
|
| 61 |
+
["Wie steht die CDU zur Cannabislegalisierung?", "All", "DE"],
|
| 62 |
+
["Wie steht die FDP zur Rente?", "All", "DE"],
|
| 63 |
+
["Was sagten die Parteien in der ersten Legislaturperiode über die nazi Vergangenheit?", "1. Legislaturperiode", "DE"],
|
| 64 |
+
["Wie wird die Ehe für alle diskutiert?", "18. Legislaturperiode", "DE"],
|
| 65 |
+
["How is the GDR perceived?", "11. Legislaturperiode", "EN"]
|
| 66 |
+
],
|
| 67 |
+
cache_examples=True, #true increases loading time
|
| 68 |
+
additional_inputs = [db_inputs, prompt_language],
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
with gr.Tab("KeywordSearch"):
|
| 72 |
+
|
| 73 |
+
with gr.Blocks() as Block:
|
| 74 |
+
# Keyword Input
|
| 75 |
+
keyword_box = gr.Textbox(label='keyword')
|
| 76 |
+
|
| 77 |
+
# Additional Input (hidden)
|
| 78 |
+
with gr.Accordion('Filter', open=False):
|
| 79 |
+
# Row orientation
|
| 80 |
+
with gr.Row() as additional_input:
|
| 81 |
+
n_slider = gr.Slider(label="Number of Results",info="Other filters reduces the returned results", minimum=1, maximum=100, step=1, value=10)
|
| 82 |
+
party_dopdown = gr.Dropdown(value='All', choices=partys, label='Party')
|
| 83 |
+
# ToDo: Add date or legislature filter as input
|
| 84 |
+
#start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
|
| 85 |
+
#end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
|
| 86 |
+
|
| 87 |
+
search_btn = gr.Button('Search')
|
| 88 |
+
|
| 89 |
+
with gr.Column(visible=False) as output_col:
|
| 90 |
+
results_df = gr.Dataframe(label='Results', interactive=False)
|
| 91 |
+
|
| 92 |
+
# Download results from keyword search
|
| 93 |
+
with gr.Accordion('Would you like to download your results?', open=False) as download_row:
|
| 94 |
+
with gr.Row():
|
| 95 |
+
ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
|
| 96 |
+
export_btn = gr.Button('Export')
|
| 97 |
+
file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
|
| 98 |
+
|
| 99 |
+
# Keyword Search on click
|
| 100 |
+
def search(keyword, n, party): # ToDo: Include party and timedate
|
| 101 |
+
return {
|
| 102 |
+
output_col: gr.Column(visible=True),
|
| 103 |
+
results_df: keyword_search(query=keyword, n=n, party_filter=party),
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
search_btn.click(
|
| 107 |
+
fn=search,
|
| 108 |
+
inputs=[keyword_box, n_slider, party_dopdown],
|
| 109 |
+
outputs=[output_col, results_df],
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
# Export data to a downloadable format
|
| 113 |
+
def export(df, keyword, ftype=None):
|
| 114 |
+
if ftype == "csv":
|
| 115 |
+
file = f'{keyword}.csv'
|
| 116 |
+
df.to_csv(file, index = False)
|
| 117 |
+
return gr.File(value=file,visible=True)
|
| 118 |
+
elif ftype == "json":
|
| 119 |
+
file = f'{keyword}.json'
|
| 120 |
+
df.to_json(file, index = True)
|
| 121 |
+
return gr.File(value=file,visible=True)
|
| 122 |
+
else:
|
| 123 |
+
file = f'{keyword}.xlsx'
|
| 124 |
+
df.to_excel(file, index = True)
|
| 125 |
+
return gr.File(value=file,visible=True)
|
| 126 |
+
|
| 127 |
+
export_btn.click(
|
| 128 |
+
fn=export,
|
| 129 |
+
inputs=[results_df, keyword_box, ftype_dropdown],
|
| 130 |
+
outputs=[file],
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
with gr.Tab("About"):
|
| 134 |
+
gr.Markdown("""
|
| 135 |
+
<h2>Welcome to <strong>PoliticsToYou</strong> - your playground for investigating the heart of politics in Germany</h2>
|
| 136 |
+
<ul>
|
| 137 |
+
<p>Would you like to gain insights into political debates or reveal party positions on specific topics from any legislature?</p>
|
| 138 |
+
<br>
|
| 139 |
+
<p>You can use the ChatBot to ask all your questions or search for related speech content in the Keyword Search section.</p>
|
| 140 |
+
</ul>
|
| 141 |
+
<p>Looking forward to your feedback!</p>
|
| 142 |
+
|
| 143 |
+
<h3>Further improvements & Ideas:</h3>
|
| 144 |
+
<ul>
|
| 145 |
+
<li>Experiment with different LLMs and prompt templates</li>
|
| 146 |
+
<li>Include chat history</li>
|
| 147 |
+
<li>Add a legislature filter to KeywordSearch</li>
|
| 148 |
+
<li>Exclude short document splits</li>
|
| 149 |
+
<li>Improve inference time</li>
|
| 150 |
+
<li>Update the database every month with the latest content</li>
|
| 151 |
+
<li>Expand the scope to party manifestos and different countries</li>
|
| 152 |
+
</ul>
|
| 153 |
+
|
| 154 |
+
<p>Big thank you to the OpenDiscourse team for creating the underlying speeches corpus. Check out their website <a href="https://opendiscourse.de/">here</a>.</p>
|
| 155 |
+
|
| 156 |
+
"""
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
if __name__ == "__main__":
|
| 160 |
+
App.launch(share=False) # true not supported on hf spaces
|
| 161 |
+
|
| 162 |
+
|
README.md
CHANGED
|
@@ -1,12 +1,14 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
|
|
|
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PoliticsToYou
|
| 3 |
+
emoji: 🏢
|
| 4 |
+
colorFrom: pink
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.26.0
|
| 8 |
+
python_version: 3.11
|
| 9 |
+
app_file: Home.py
|
| 10 |
pinned: false
|
| 11 |
+
short_description: Explore speeches from the German Bundestag since 1949.
|
| 12 |
---
|
| 13 |
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
requirements.txt
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core Network and Async
|
| 2 |
+
aiohttp>=3.11.0
|
| 3 |
+
aiosignal>=1.3.1
|
| 4 |
+
annotated-types>=0.7.0
|
| 5 |
+
anyio>=4.8.0
|
| 6 |
+
attrs>=25.1.0
|
| 7 |
+
certifi>=2025.1.0
|
| 8 |
+
charset-normalizer>=3.4.0
|
| 9 |
+
|
| 10 |
+
numpy==1.26.4
|
| 11 |
+
pandas<3
|
| 12 |
+
Pillow<11
|
| 13 |
+
MarkupSafe<3
|
| 14 |
+
pydantic==2.10.6
|
| 15 |
+
|
| 16 |
+
# Data Science and Processing
|
| 17 |
+
# numpy>=2.4.1
|
| 18 |
+
# pandas>=2.3.3
|
| 19 |
+
# Pillow>=12.1.0
|
| 20 |
+
openpyxl>=3.1.5
|
| 21 |
+
et-xmlfile>=2.0.0
|
| 22 |
+
mpmath>=1.3.0
|
| 23 |
+
|
| 24 |
+
# Fixed Core Frameworks
|
| 25 |
+
langchain>=1.0.0
|
| 26 |
+
langchain-classic>=1.0.0
|
| 27 |
+
langchain-community>=0.3.0
|
| 28 |
+
langchain-core>=1.0.0
|
| 29 |
+
langchain-huggingface>=1.0.0
|
| 30 |
+
langchain-text-splitters>=1.0.0
|
| 31 |
+
|
| 32 |
+
# Vector Search and AI Infrastructure
|
| 33 |
+
faiss-cpu>=1.13.2
|
| 34 |
+
huggingface-hub>=0.36.0
|
| 35 |
+
fsspec>=2025.1.0
|
| 36 |
+
filelock>=3.16.0
|
| 37 |
+
sentence-transformers>=3.3.0
|
| 38 |
+
|
| 39 |
+
# Serialization and Utilities
|
| 40 |
+
orjson>=3.10.12
|
| 41 |
+
jsonpatch>=1.33
|
| 42 |
+
jsonpointer>=3.0.0
|
| 43 |
+
marshmallow>=3.23.0
|
| 44 |
+
joblib>=1.4.2
|
| 45 |
+
packaging>=24.2
|
| 46 |
+
idna>=3.10
|
| 47 |
+
typing-extensions>=4.12.0
|
| 48 |
+
|
| 49 |
+
# Web and UI
|
| 50 |
+
# Jinja2>=3.1.5
|
| 51 |
+
# MarkupSafe>=3.0.0
|
| 52 |
+
|
| 53 |
+
# System and Performance
|
| 54 |
+
greenlet>=3.1.1
|
| 55 |
+
networkx>=3.4.2
|
| 56 |
+
frozenlist>=1.5.0
|
| 57 |
+
dataclasses-json>=0.6.7
|
src/FAISS/FAISS.ipynb
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import pandas as pd\n",
|
| 10 |
+
"import psycopg2\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
| 13 |
+
"from langchain_community.document_loaders import DataFrameLoader\n",
|
| 14 |
+
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
|
| 15 |
+
"from langchain_community.vectorstores import FAISS\n",
|
| 16 |
+
"from datetime import datetime\n",
|
| 17 |
+
"\n"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "markdown",
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"source": [
|
| 24 |
+
"### Retrieve Speeches"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"cell_type": "code",
|
| 29 |
+
"execution_count": null,
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"outputs": [],
|
| 32 |
+
"source": [
|
| 33 |
+
"# db_connection -----------------------------------------------------------\n",
|
| 34 |
+
"con_details = {\n",
|
| 35 |
+
" \"host\" : \"localhost\",\n",
|
| 36 |
+
" \"database\" : \"next\",\n",
|
| 37 |
+
" \"user\" : \"postgres\",\n",
|
| 38 |
+
" \"password\" : \"postgres\",\n",
|
| 39 |
+
" \"port\" : \"5433\"\n",
|
| 40 |
+
"}\n",
|
| 41 |
+
"con = psycopg2.connect(**con_details)\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"# get data tables ---------------------------------------------------------\n",
|
| 44 |
+
"df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
|
| 45 |
+
" FROM open_discourse.speeches AS s\n",
|
| 46 |
+
" INNER JOIN open_discourse.factions AS f ON\n",
|
| 47 |
+
" s.faction_id = f.id;\"\"\", con)"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"cell_type": "markdown",
|
| 52 |
+
"metadata": {},
|
| 53 |
+
"source": [
|
| 54 |
+
"### Process speeches"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": null,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [],
|
| 62 |
+
"source": [
|
| 63 |
+
"print(set(df['party'].to_list()))"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"cell_type": "code",
|
| 68 |
+
"execution_count": null,
|
| 69 |
+
"metadata": {},
|
| 70 |
+
"outputs": [],
|
| 71 |
+
"source": [
|
| 72 |
+
"# Removing keys from interruptions of a speech\n",
|
| 73 |
+
"df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) \n",
|
| 74 |
+
"df['date'] = pd.to_datetime(df['date'])\n",
|
| 75 |
+
"df"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"cell_type": "code",
|
| 80 |
+
"execution_count": null,
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [
|
| 83 |
+
{
|
| 84 |
+
"data": {
|
| 85 |
+
"text/html": [
|
| 86 |
+
"<div>\n",
|
| 87 |
+
"<style scoped>\n",
|
| 88 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 89 |
+
" vertical-align: middle;\n",
|
| 90 |
+
" }\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" .dataframe tbody tr th {\n",
|
| 93 |
+
" vertical-align: top;\n",
|
| 94 |
+
" }\n",
|
| 95 |
+
"\n",
|
| 96 |
+
" .dataframe thead th {\n",
|
| 97 |
+
" text-align: right;\n",
|
| 98 |
+
" }\n",
|
| 99 |
+
"</style>\n",
|
| 100 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 101 |
+
" <thead>\n",
|
| 102 |
+
" <tr style=\"text-align: right;\">\n",
|
| 103 |
+
" <th></th>\n",
|
| 104 |
+
" <th>id</th>\n",
|
| 105 |
+
" <th>speech_content</th>\n",
|
| 106 |
+
" <th>date</th>\n",
|
| 107 |
+
" <th>party</th>\n",
|
| 108 |
+
" </tr>\n",
|
| 109 |
+
" </thead>\n",
|
| 110 |
+
" <tbody>\n",
|
| 111 |
+
" <tr>\n",
|
| 112 |
+
" <th>0</th>\n",
|
| 113 |
+
" <td>0</td>\n",
|
| 114 |
+
" <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
|
| 115 |
+
" <td>1949-09-12</td>\n",
|
| 116 |
+
" <td>not found</td>\n",
|
| 117 |
+
" </tr>\n",
|
| 118 |
+
" <tr>\n",
|
| 119 |
+
" <th>1</th>\n",
|
| 120 |
+
" <td>1</td>\n",
|
| 121 |
+
" <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
|
| 122 |
+
" <td>1949-09-12</td>\n",
|
| 123 |
+
" <td>not found</td>\n",
|
| 124 |
+
" </tr>\n",
|
| 125 |
+
" <tr>\n",
|
| 126 |
+
" <th>2</th>\n",
|
| 127 |
+
" <td>2</td>\n",
|
| 128 |
+
" <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
|
| 129 |
+
" <td>1949-09-12</td>\n",
|
| 130 |
+
" <td>not found</td>\n",
|
| 131 |
+
" </tr>\n",
|
| 132 |
+
" <tr>\n",
|
| 133 |
+
" <th>3</th>\n",
|
| 134 |
+
" <td>3</td>\n",
|
| 135 |
+
" <td>Ja, ich habe den Wunsch.\\n</td>\n",
|
| 136 |
+
" <td>1949-09-12</td>\n",
|
| 137 |
+
" <td>not found</td>\n",
|
| 138 |
+
" </tr>\n",
|
| 139 |
+
" <tr>\n",
|
| 140 |
+
" <th>4</th>\n",
|
| 141 |
+
" <td>4</td>\n",
|
| 142 |
+
" <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
|
| 143 |
+
" <td>1949-09-12</td>\n",
|
| 144 |
+
" <td>not found</td>\n",
|
| 145 |
+
" </tr>\n",
|
| 146 |
+
" <tr>\n",
|
| 147 |
+
" <th>...</th>\n",
|
| 148 |
+
" <td>...</td>\n",
|
| 149 |
+
" <td>...</td>\n",
|
| 150 |
+
" <td>...</td>\n",
|
| 151 |
+
" <td>...</td>\n",
|
| 152 |
+
" </tr>\n",
|
| 153 |
+
" <tr>\n",
|
| 154 |
+
" <th>930955</th>\n",
|
| 155 |
+
" <td>1084268</td>\n",
|
| 156 |
+
" <td>\\n\\nWir sind zwar Kollegen.</td>\n",
|
| 157 |
+
" <td>2022-12-16</td>\n",
|
| 158 |
+
" <td>not found</td>\n",
|
| 159 |
+
" </tr>\n",
|
| 160 |
+
" <tr>\n",
|
| 161 |
+
" <th>930956</th>\n",
|
| 162 |
+
" <td>1084269</td>\n",
|
| 163 |
+
" <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
|
| 164 |
+
" <td>2022-12-16</td>\n",
|
| 165 |
+
" <td>CDU/CSU</td>\n",
|
| 166 |
+
" </tr>\n",
|
| 167 |
+
" <tr>\n",
|
| 168 |
+
" <th>930957</th>\n",
|
| 169 |
+
" <td>1084270</td>\n",
|
| 170 |
+
" <td>\\n\\nVielen Dank.</td>\n",
|
| 171 |
+
" <td>2022-12-16</td>\n",
|
| 172 |
+
" <td>not found</td>\n",
|
| 173 |
+
" </tr>\n",
|
| 174 |
+
" <tr>\n",
|
| 175 |
+
" <th>930958</th>\n",
|
| 176 |
+
" <td>1084272</td>\n",
|
| 177 |
+
" <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
|
| 178 |
+
" <td>2022-12-16</td>\n",
|
| 179 |
+
" <td>not found</td>\n",
|
| 180 |
+
" </tr>\n",
|
| 181 |
+
" <tr>\n",
|
| 182 |
+
" <th>930959</th>\n",
|
| 183 |
+
" <td>1084273</td>\n",
|
| 184 |
+
" <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
|
| 185 |
+
" <td>2022-12-16</td>\n",
|
| 186 |
+
" <td>SPD</td>\n",
|
| 187 |
+
" </tr>\n",
|
| 188 |
+
" </tbody>\n",
|
| 189 |
+
"</table>\n",
|
| 190 |
+
"<p>930960 rows × 4 columns</p>\n",
|
| 191 |
+
"</div>"
|
| 192 |
+
],
|
| 193 |
+
"text/plain": [
|
| 194 |
+
" id speech_content date \\\n",
|
| 195 |
+
"0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... 1949-09-12 \n",
|
| 196 |
+
"1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n 1949-09-12 \n",
|
| 197 |
+
"2 2 Ich danke für diese Erklärung. Ich stelle dami... 1949-09-12 \n",
|
| 198 |
+
"3 3 Ja, ich habe den Wunsch.\\n 1949-09-12 \n",
|
| 199 |
+
"4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... 1949-09-12 \n",
|
| 200 |
+
"... ... ... ... \n",
|
| 201 |
+
"930955 1084268 \\n\\nWir sind zwar Kollegen. 2022-12-16 \n",
|
| 202 |
+
"930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! 2022-12-16 \n",
|
| 203 |
+
"930957 1084270 \\n\\nVielen Dank. 2022-12-16 \n",
|
| 204 |
+
"930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... 2022-12-16 \n",
|
| 205 |
+
"930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... 2022-12-16 \n",
|
| 206 |
+
"\n",
|
| 207 |
+
" party \n",
|
| 208 |
+
"0 not found \n",
|
| 209 |
+
"1 not found \n",
|
| 210 |
+
"2 not found \n",
|
| 211 |
+
"3 not found \n",
|
| 212 |
+
"4 not found \n",
|
| 213 |
+
"... ... \n",
|
| 214 |
+
"930955 not found \n",
|
| 215 |
+
"930956 CDU/CSU \n",
|
| 216 |
+
"930957 not found \n",
|
| 217 |
+
"930958 not found \n",
|
| 218 |
+
"930959 SPD \n",
|
| 219 |
+
"\n",
|
| 220 |
+
"[930960 rows x 4 columns]"
|
| 221 |
+
]
|
| 222 |
+
},
|
| 223 |
+
"execution_count": 3,
|
| 224 |
+
"metadata": {},
|
| 225 |
+
"output_type": "execute_result"
|
| 226 |
+
}
|
| 227 |
+
],
|
| 228 |
+
"source": [
|
| 229 |
+
"# Convert to proper time format\n",
|
| 230 |
+
"df['date'] = pd.to_datetime(df['date'])\n"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"cell_type": "code",
|
| 235 |
+
"execution_count": 27,
|
| 236 |
+
"metadata": {},
|
| 237 |
+
"outputs": [],
|
| 238 |
+
"source": [
|
| 239 |
+
"def split_documents(df, min_chunk_size=100):\n",
|
| 240 |
+
" \"\"\"\n",
|
| 241 |
+
" Load documents from a DataFrame, split them into smaller chunks for vector storage and remove chunks of small size.\n",
|
| 242 |
+
"\n",
|
| 243 |
+
" Parameters\n",
|
| 244 |
+
" ----------\n",
|
| 245 |
+
" df : pandas.DataFrame\n",
|
| 246 |
+
" A DataFrame containing the documents to be processed, with a column named 'speech_content'.\n",
|
| 247 |
+
" min_chunk_size : int, optional\n",
|
| 248 |
+
" Minimum number of characters a chunk must have to be included in the result. Default is 100.\n",
|
| 249 |
+
"\n",
|
| 250 |
+
" Returns\n",
|
| 251 |
+
" -------\n",
|
| 252 |
+
" list\n",
|
| 253 |
+
" A list of split document chunks ready for further processing or vectorization.\n",
|
| 254 |
+
" \"\"\"\n",
|
| 255 |
+
" # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load\n",
|
| 256 |
+
" loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')\n",
|
| 257 |
+
" # Load the data from the DataFrame into a suitable format for processing\n",
|
| 258 |
+
" data = loader.load()\n",
|
| 259 |
+
" # Initialize a RecursiveCharacterTextSplitter to split the text into chunks\n",
|
| 260 |
+
" splitter = RecursiveCharacterTextSplitter(\n",
|
| 261 |
+
" chunk_size=1024,\n",
|
| 262 |
+
" chunk_overlap=32,\n",
|
| 263 |
+
" length_function=len,\n",
|
| 264 |
+
" is_separator_regex=False,\n",
|
| 265 |
+
" )\n",
|
| 266 |
+
" # Split the loaded data into smaller chunks using the splitter\n",
|
| 267 |
+
" documents = splitter.split_documents(documents=data)\n",
|
| 268 |
+
" # Discard small chunks below the threshold\n",
|
| 269 |
+
" cleaned_documents = [doc for doc in documents if len(doc.page_content) >= min_chunk_size]\n",
|
| 270 |
+
"\n",
|
| 271 |
+
" return cleaned_documents"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": null,
|
| 277 |
+
"metadata": {},
|
| 278 |
+
"outputs": [
|
| 279 |
+
{
|
| 280 |
+
"name": "stderr",
|
| 281 |
+
"output_type": "stream",
|
| 282 |
+
"text": [
|
| 283 |
+
"c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
| 284 |
+
" warnings.warn(\n"
|
| 285 |
+
]
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"name": "stdout",
|
| 289 |
+
"output_type": "stream",
|
| 290 |
+
"text": [
|
| 291 |
+
"Sucessfully created vector store for 1. legislature\n",
|
| 292 |
+
"Sucessfully created vector store for 2. legislature\n",
|
| 293 |
+
"Sucessfully created vector store for 3. legislature\n",
|
| 294 |
+
"Sucessfully created vector store for 4. legislature\n",
|
| 295 |
+
"Sucessfully created vector store for 5. legislature\n",
|
| 296 |
+
"Sucessfully created vector store for 6. legislature\n",
|
| 297 |
+
"Sucessfully created vector store for 7. legislature\n",
|
| 298 |
+
"Sucessfully created vector store for 8. legislature\n",
|
| 299 |
+
"Sucessfully created vector store for 9. legislature\n",
|
| 300 |
+
"Sucessfully created vector store for 10. legislature\n",
|
| 301 |
+
"Sucessfully created vector store for 11. legislature\n",
|
| 302 |
+
"Sucessfully created vector store for 12. legislature\n",
|
| 303 |
+
"Sucessfully created vector store for 13. legislature\n",
|
| 304 |
+
"Sucessfully created vector store for 14. legislature\n",
|
| 305 |
+
"Sucessfully created vector store for 15. legislature\n",
|
| 306 |
+
"Sucessfully created vector store for 16. legislature\n",
|
| 307 |
+
"Sucessfully created vector store for 17. legislature\n",
|
| 308 |
+
"Sucessfully created vector store for 18. legislature\n",
|
| 309 |
+
"Sucessfully created vector store for 19. legislature\n",
|
| 310 |
+
"Sucessfully created vector store for 20. legislature\n"
|
| 311 |
+
]
|
| 312 |
+
}
|
| 313 |
+
],
|
| 314 |
+
"source": [
|
| 315 |
+
"# Define starting dates of legislature periods\n",
|
| 316 |
+
"dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
|
| 317 |
+
"# Load sentence transformer \n",
|
| 318 |
+
"embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
|
| 319 |
+
"\n",
|
| 320 |
+
"# Create vector store for all speaches\n",
|
| 321 |
+
"# Split text into documents for vectorstore\n",
|
| 322 |
+
"documents = split_documents(df)\n",
|
| 323 |
+
"# Create and save faiss vectorstorage\n",
|
| 324 |
+
"index_name = 'speeches_1949_09_12'\n",
|
| 325 |
+
"db = FAISS.from_documents(documents, embeddings)\n",
|
| 326 |
+
"db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
|
| 327 |
+
"print(\"Sucessfully created vector store for all legislature\")\n",
|
| 328 |
+
"\n",
|
| 329 |
+
"# Create vector store for each legislature\n",
|
| 330 |
+
"# loop parameters\n",
|
| 331 |
+
"period = 1\n",
|
| 332 |
+
"previous_date = None\n",
|
| 333 |
+
"\n",
|
| 334 |
+
"# Iterate over all date to split by legislature getting vector stores for each period\n",
|
| 335 |
+
"for date in dates:\n",
|
| 336 |
+
" if previous_date is None:\n",
|
| 337 |
+
" legislature_df = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
|
| 338 |
+
" elif date is None:\n",
|
| 339 |
+
" legislature_df = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
|
| 340 |
+
" else:\n",
|
| 341 |
+
" legislature_df = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
|
| 342 |
+
"\n",
|
| 343 |
+
" \n",
|
| 344 |
+
" # Split text into documents for vectorstore\n",
|
| 345 |
+
" documents = split_documents(legislature_df)\n",
|
| 346 |
+
"\n",
|
| 347 |
+
" # Create and save faiss vectorstorage\n",
|
| 348 |
+
" index_name = f'{period}_legislature'\n",
|
| 349 |
+
" db = FAISS.from_documents(documents, embeddings)\n",
|
| 350 |
+
" db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
|
| 351 |
+
" print(f\"Sucessfully created vector store for {period}. legislature\")\n",
|
| 352 |
+
"\n",
|
| 353 |
+
" # Change loop parameters for next iteration\n",
|
| 354 |
+
" period += 1\n",
|
| 355 |
+
" previous_date = date\n",
|
| 356 |
+
"\n",
|
| 357 |
+
"\n",
|
| 358 |
+
" \n"
|
| 359 |
+
]
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"cell_type": "markdown",
|
| 363 |
+
"metadata": {},
|
| 364 |
+
"source": [
|
| 365 |
+
"This data has been uploaded to: https://huggingface.co/datasets/TomData/speeches-of-the-german-parliament"
|
| 366 |
+
]
|
| 367 |
+
}
|
| 368 |
+
],
|
| 369 |
+
"metadata": {
|
| 370 |
+
"kernelspec": {
|
| 371 |
+
"display_name": "Python 3",
|
| 372 |
+
"language": "python",
|
| 373 |
+
"name": "python3"
|
| 374 |
+
},
|
| 375 |
+
"language_info": {
|
| 376 |
+
"codemirror_mode": {
|
| 377 |
+
"name": "ipython",
|
| 378 |
+
"version": 3
|
| 379 |
+
},
|
| 380 |
+
"file_extension": ".py",
|
| 381 |
+
"mimetype": "text/x-python",
|
| 382 |
+
"name": "python",
|
| 383 |
+
"nbconvert_exporter": "python",
|
| 384 |
+
"pygments_lexer": "ipython3",
|
| 385 |
+
"version": "3.11.4"
|
| 386 |
+
}
|
| 387 |
+
},
|
| 388 |
+
"nbformat": 4,
|
| 389 |
+
"nbformat_minor": 2
|
| 390 |
+
}
|
src/chatbot.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 2 |
+
from langchain_huggingface import HuggingFaceEndpoint
|
| 3 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_community.vectorstores import FAISS
|
| 5 |
+
from langchain_huggingface import ChatHuggingFace
|
| 6 |
+
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
|
| 7 |
+
from langchain_classic.chains import create_retrieval_chain
|
| 8 |
+
from langchain_community.docstore.in_memory import InMemoryDocstore
|
| 9 |
+
from faiss import IndexFlatL2
|
| 10 |
+
import pandas as pd
|
| 11 |
+
# Load environmental variables from .env-file
|
| 12 |
+
from dotenv import load_dotenv, find_dotenv
|
| 13 |
+
load_dotenv(find_dotenv())
|
| 14 |
+
|
| 15 |
+
# Define important variables
# Multilingual sentence-transformer; must match the model used to build the FAISS indices.
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
# Hosted inference endpoint for the chat model; very low temperature keeps answers
# close to the retrieved context.
endpoint = HuggingFaceEndpoint(
    # ToDo: Experiment with different models here
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    provider="novita",
    task="conversational",
    max_new_tokens=512,
    temperature=0.01,
    top_k=30,
    repetition_penalty=1.03,
)
llm = ChatHuggingFace(llm=endpoint)


# ToDo: Experiment with different templates
# Experimental prompt in Llama/Mistral [INST] format (currently unused by chatbot()).
prompt_test = ChatPromptTemplate.from_template("""<s>[INST]
Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:

Context: {context}

Question: {input}
[/INST]"""

)
prompt_de = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:

<context>
{context}
</context>

Frage: {input}
"""
# Returns the answer in German
)
prompt_en = ChatPromptTemplate.from_template("""Answer the following question in English and solely based on the provided context:

<context>
{context}
</context>

Question: {input}
"""
# Returns the answer in English
)
# Pre-load whole vectordatabase to reduce inference during production
# NOTE(review): allow_dangerous_deserialization unpickles the index files; acceptable
# only because the FAISS folder ships with this repo — confirm it is never user-supplied.
db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
|
| 63 |
+
|
| 64 |
+
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on the specified inputs.

    Parameters
    ----------
    inputs : list of str
        Legislature selections such as "20. Legislaturperiode", or the special
        keyword "All". If the list is empty, or its first entry is "All" or
        None, the pre-loaded vector store covering all speeches is returned.

    embeddings : Embeddings
        Embedding model used to load and query the vector stores; must match
        the model the indices were built with.

    Returns
    -------
    FAISS
        A FAISS vector store that combines the specified indices into a single vector store.
    """
    # Default folder path
    folder_path = "./src/FAISS"

    # Fast path: no selection, or "All" selected first -> pre-loaded full store.
    # Guarding against an empty list fixes an IndexError on inputs[0].
    if not inputs or inputs[0] == "All" or inputs[0] is None:
        return db_all

    # Initialize an empty store with the same dimensionality as the embeddings
    dimensions = len(embeddings.embed_query("dummy"))

    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Selections look like "20. Legislaturperiode"; the leading number names the index.
    for selection in inputs:
        # Ignore if user also selected All among other legislatures
        if selection == "All":
            continue
        # Retrieve selected index and merge vector stores
        index = selection.split(".")[0]
        local_db = FAISS.load_local(
            folder_path=folder_path,
            index_name=f"{index}_legislature",
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        db.merge_from(local_db)
    print('Successfully merged inputs')
    return db
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def RAG(llm, prompt, db, question):
    """
    Answer *question* via Retrieval-Augmented Generation: fetch relevant
    documents from *db* and pass them, together with the question, to the
    language model through the given prompt template.

    Parameters:
    ----------
    llm : LanguageModel
        Language model instance that generates the final answer.

    prompt : str
        Prompt template that structures how context and question are presented
        to the language model.

    db : VectorStore
        Vector store used to retrieve documents relevant to the question.

    question : str
        The query to be answered.

    Returns:
    -------
    str
        The chain's response, based on the retrieved context and the question.
    """
    # Chain that stuffs the retrieved documents into the prompt for the LLM
    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    # Expose the vector store as a retriever and wire it in front of the chain
    retrieval_chain = create_retrieval_chain(db.as_retriever(), document_chain)
    # Run the full retrieve-then-generate pipeline on the question
    return retrieval_chain.invoke({"input": question})
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
async def chatbot(message, history, db_inputs, prompt_language, llm=llm):
    """
    Generate a response from the chatbot based on the provided message, history, database inputs, prompt language, and LLM model.

    Parameters:
    -----------
    message : str
        The message or question to be answered by the chatbot.

    history : list
        The history of previous interactions or messages (currently unused;
        kept for compatibility with the chat-UI callback signature).

    db_inputs : list
        A list of strings specifying which vector stores to combine. Each string represents a specific index or a special keyword "All".

    prompt_language : str
        Language of the prompt: "DE" for German, anything else selects English.

    llm : LLM, optional
        Language model used to generate the response. Defaults to the module-level `llm`.

    Returns:
    --------
    str
        The response generated by the chatbot.
    """
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)

    # Pick prompt template and the answer prefix that matches its language
    if prompt_language == "DE":
        prompt, prefix = prompt_de, "Antwort: "
    else:
        prompt, prefix = prompt_en, "Answer: "

    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)

    # Strip a leading "Antwort:"/"Answer:" prefix when the model emits one;
    # otherwise fall back to the raw answer (or an error placeholder).
    try:
        response = raw_response["answer"].split(prefix)[1]
    except (KeyError, IndexError):
        response = raw_response.get("answer", "Error generating response.")

    return str(response) # Ensure result is cast to string
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def keyword_search(query, n=10, embeddings=embeddings, method="ss", party_filter="All"):
    """
    Retrieve speech contents matching *query* from the full vector store.

    Parameters:
    ----------
    query : str
        The keyword(s) to search for in the speech contents.

    n : int, optional
        Number of speeches to retrieve (default is 10). The party filter is
        applied after retrieval, so fewer than *n* rows may be returned.

    embeddings : Embeddings, optional
        Embedding model used to embed the query (default is the module-level
        embeddings instance).

    method : str, optional
        Retrieval method: 'ss' (semantic search) or 'mmr'
        (maximal marginal relevance). Default is 'ss'.

    party_filter : str, optional
        Keep only speeches of this party; 'All' keeps every party (default 'All').

    Returns:
    -------
    pandas.DataFrame
        Columns 'Speech Content', 'Date', 'Party' and, for method='mmr',
        'Relevance' (L2 distance, lower is closer; sorted ascending).

    Raises:
    ------
    ValueError
        If *method* is neither 'ss' nor 'mmr'.
    """
    db = get_vectorstore(inputs=["All"], embeddings=embeddings)
    query_embedding = embeddings.embed_query(query)

    # Maximal Marginal Relevance
    if method == "mmr":
        rows = []
        results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
        for doc, score in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         # Format like the 'ss' branch for a consistent display
                         'Date': doc.metadata["date"].strftime("%Y-%m-%d"),
                         'Party': party,
                         'Relevance': round(score, ndigits=2)})
        # Build the frame once instead of pd.concat per row (avoids O(n^2) copying)
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        df_res.sort_values('Relevance', inplace=True, ascending=True)
        return df_res

    # Similarity Search
    elif method == "ss":
        kws_data = []
        results = db.similarity_search_by_vector(query_embedding, k=n)
        for doc in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            kws_data.append({'Speech Content': doc.page_content,
                             'Date': doc.metadata["date"].strftime("%Y-%m-%d"),
                             'Party': party})
        return pd.DataFrame(kws_data)

    else:
        raise ValueError("Method must be either 'ss' or 'mmr'")
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
|