TomData commited on
Commit
83c7ddf
·
1 Parent(s): 50a5a90

project upload

Browse files
Files changed (7) hide show
  1. .gitattributes +4 -34
  2. .gitignore +4 -0
  3. Home.py +162 -0
  4. README.md +9 -7
  5. requirements.txt +57 -0
  6. src/FAISS/FAISS.ipynb +390 -0
  7. src/chatbot.py +286 -0
.gitattributes CHANGED
@@ -1,35 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.pkl filter=lfs diff=lfs merge=lfs -text
2
+ *.csv filter=lfs diff=lfs merge=lfs -text
3
+ *.faiss filter=lfs diff=lfs merge=lfs -text
4
+ src/FAISS/speeches_1949_09_12.faiss filter=lfs diff=lfs merge=lfs -text
5
+ src/FAISS/speeches_1949_09_12.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__
2
+ hf_upload.py
3
+ .env
4
+ .mypy_cache
Home.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from huggingface_hub import snapshot_download

# Download the legislature vector database from the Hub BEFORE importing
# src.chatbot below: that module loads the FAISS index from src/FAISS at
# import time, so the files must already exist on disk.
REPO_ID = "TomData/speeches-of-the-german-parliament"
LOCAL_DIR = "src/FAISS"
snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_DIR, repo_type="dataset")

import gradio as gr
#from gradio_calendar import Calendar
#from datetime import datetime
from src.chatbot import chatbot, keyword_search

# Only required when running locally
# import os
# from dotenv import load_dotenv
# from huggingface_hub import login
# load_dotenv(dotenv_path=".env")
# login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
20
# UI choices: "All" plus every legislature period, newest (20.) first.
legislature_periods = ["All"] + [
    f"{n}. Legislaturperiode" for n in range(20, 0, -1)
]

# Parties occurring in the speeches corpus; "All" disables the party filter.
partys = [
    'All',
    'CDU/CSU', 'SPD', 'AfD', 'Grüne', 'FDP', 'DIE LINKE.',
    'GB/BHE', 'DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD',
    'DA', 'FVP', 'DP', 'Z', 'PDS', 'Fraktionslos',
    'not found', 'Gast',
]
48
# ---------------------------------------------------------------------------
# Gradio application layout: three tabs (RAG chatbot, keyword search, about).
# ---------------------------------------------------------------------------
with gr.Blocks() as App:

    with gr.Tab("ChatBot"):
        with gr.Blocks(fill_height=True):
            with gr.Accordion(open=False, label="Filter"):
                # Retrieval filters forwarded to chatbot() in src/chatbot.py
                db_inputs = gr.Dropdown(
                    choices=legislature_periods,
                    value="All",
                    multiselect=True,
                    label="Legislature",
                    info="Select a combination of legislatures as basis for the chatbot's replies",
                    show_label=True,
                )
                prompt_language = gr.Dropdown(
                    choices=["DE", "EN"],
                    value="DE",
                    label="Language",
                    info="Choose output language",
                    multiselect=False,
                )

            gr.ChatInterface(
                chatbot,
                title="PoliticsToYou",
                description="Ask anything about your favorite political topic from any legislature period",
                examples=[
                    ["Wie steht die CDU zur Cannabislegalisierung?", "All", "DE"],
                    ["Wie steht die FDP zur Rente?", "All", "DE"],
                    ["Was sagten die Parteien in der ersten Legislaturperiode über die nazi Vergangenheit?", "1. Legislaturperiode", "DE"],
                    ["Wie wird die Ehe für alle diskutiert?", "18. Legislaturperiode", "DE"],
                    ["How is the GDR perceived?", "11. Legislaturperiode", "EN"],
                ],
                cache_examples=True,  # true increases loading time
                additional_inputs=[db_inputs, prompt_language],
            )

    with gr.Tab("KeywordSearch"):

        with gr.Blocks() as Block:
            # Free-text keyword the user searches for
            keyword_box = gr.Textbox(label='keyword')

            # Additional filters, collapsed by default
            with gr.Accordion('Filter', open=False):
                with gr.Row() as additional_input:
                    n_slider = gr.Slider(label="Number of Results", info="Other filters reduces the returned results", minimum=1, maximum=100, step=1, value=10)
                    party_dropdown = gr.Dropdown(value='All', choices=partys, label='Party')
                    # ToDo: Add date or legislature filter as input
                    #start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
                    #end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)

            search_btn = gr.Button('Search')

            # Results column stays hidden until the first search completes
            with gr.Column(visible=False) as output_col:
                results_df = gr.Dataframe(label='Results', interactive=False)

            # Download section for the current keyword-search results
            with gr.Accordion('Would you like to download your results?', open=False) as download_row:
                with gr.Row():
                    ftype_dropdown = gr.Dropdown(choices=["csv", "excel", "json"], label="Format")
                    export_btn = gr.Button('Export')
                file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)

            def run_search(keyword, n, party):  # ToDo: Include party and timedate
                """Run the keyword search and reveal the results column."""
                return {
                    output_col: gr.Column(visible=True),
                    results_df: keyword_search(query=keyword, n=n, party_filter=party),
                }

            search_btn.click(
                fn=run_search,
                inputs=[keyword_box, n_slider, party_dropdown],
                outputs=[output_col, results_df],
            )

            def export(df, keyword, ftype=None):
                """Write the result table to disk in the chosen format and show the file widget."""
                if ftype == "csv":
                    path = f'{keyword}.csv'
                    df.to_csv(path, index=False)
                elif ftype == "json":
                    path = f'{keyword}.json'
                    df.to_json(path, index=True)
                else:  # any other choice (incl. "excel"/None) falls back to Excel
                    path = f'{keyword}.xlsx'
                    df.to_excel(path, index=True)
                return gr.File(value=path, visible=True)

            export_btn.click(
                fn=export,
                inputs=[results_df, keyword_box, ftype_dropdown],
                outputs=[file],
            )

    with gr.Tab("About"):
        gr.Markdown("""
        <h2>Welcome to <strong>PoliticsToYou</strong> - your playground for investigating the heart of politics in Germany</h2>
        <ul>
        <p>Would you like to gain insights into political debates or reveal party positions on specific topics from any legislature?</p>
        <br>
        <p>You can use the ChatBot to ask all your questions or search for related speech content in the Keyword Search section.</p>
        </ul>
        <p>Looking forward to your feedback!</p>

        <h3>Further improvements & Ideas:</h3>
        <ul>
        <li>Experiment with different LLMs and prompt templates</li>
        <li>Include chat history</li>
        <li>Add a legislature filter to KeywordSearch</li>
        <li>Exclude short document splits</li>
        <li>Improve inference time</li>
        <li>Update the database every month with the latest content</li>
        <li>Expand the scope to party manifestos and different countries</li>
        </ul>

        <p>Big thank you to the OpenDiscourse team for creating the underlying speeches corpus. Check out their website <a href="https://opendiscourse.de/">here</a>.</p>

        """
        )

if __name__ == "__main__":
    App.launch(share=False)  # True not supported on HF Spaces
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: Test
3
- emoji: 👀
4
- colorFrom: gray
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
 
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: PoliticsToYou
3
+ emoji: 🏢
4
+ colorFrom: pink
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.26.0
8
+ python_version: 3.11
9
+ app_file: Home.py
10
  pinned: false
11
+ short_description: Explore speeches from the German Bundestag since 1949.
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
requirements.txt ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Network and Async
2
+ aiohttp>=3.11.0
3
+ aiosignal>=1.3.1
4
+ annotated-types>=0.7.0
5
+ anyio>=4.8.0
6
+ attrs>=25.1.0
7
+ certifi>=2025.1.0
8
+ charset-normalizer>=3.4.0
9
+
10
+ numpy==1.26.4
11
+ pandas<3
12
+ Pillow<11
13
+ MarkupSafe<3
14
+ pydantic==2.10.6
15
+
16
+ # Data Science and Processing
17
+ # numpy>=2.4.1
18
+ # pandas>=2.3.3
19
+ # Pillow>=12.1.0
20
+ openpyxl>=3.1.5
21
+ et-xmlfile>=2.0.0
22
+ mpmath>=1.3.0
23
+
24
+ # Fixed Core Frameworks
25
+ langchain>=1.0.0
26
+ langchain-classic>=1.0.0
27
+ langchain-community>=0.3.0
28
+ langchain-core>=1.0.0
29
+ langchain-huggingface>=1.0.0
30
+ langchain-text-splitters>=1.0.0
31
+
32
+ # Vector Search and AI Infrastructure
33
+ faiss-cpu>=1.13.2
34
+ huggingface-hub>=0.36.0
35
+ fsspec>=2025.1.0
36
+ filelock>=3.16.0
37
+ sentence-transformers>=3.3.0
38
+
39
+ # Serialization and Utilities
40
+ orjson>=3.10.12
41
+ jsonpatch>=1.33
42
+ jsonpointer>=3.0.0
43
+ marshmallow>=3.23.0
44
+ joblib>=1.4.2
45
+ packaging>=24.2
46
+ idna>=3.10
47
+ typing-extensions>=4.12.0
48
+
49
+ # Web and UI
50
+ # Jinja2>=3.1.5
51
+ # MarkupSafe>=3.0.0
52
+
53
+ # System and Performance
54
+ greenlet>=3.1.1
55
+ networkx>=3.4.2
56
+ frozenlist>=1.5.0
57
+ dataclasses-json>=0.6.7
src/FAISS/FAISS.ipynb ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import psycopg2\n",
11
+ "\n",
12
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
13
+ "from langchain_community.document_loaders import DataFrameLoader\n",
14
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
15
+ "from langchain_community.vectorstores import FAISS\n",
16
+ "from datetime import datetime\n",
17
+ "\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "### Retrieve Speeches"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# db_connection -----------------------------------------------------------\n",
34
+ "con_details = {\n",
35
+ " \"host\" : \"localhost\",\n",
36
+ " \"database\" : \"next\",\n",
37
+ " \"user\" : \"postgres\",\n",
38
+ " \"password\" : \"postgres\",\n",
39
+ " \"port\" : \"5433\"\n",
40
+ "}\n",
41
+ "con = psycopg2.connect(**con_details)\n",
42
+ "\n",
43
+ "# get data tables ---------------------------------------------------------\n",
44
+ "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
45
+ " FROM open_discourse.speeches AS s\n",
46
+ " INNER JOIN open_discourse.factions AS f ON\n",
47
+ " s.faction_id = f.id;\"\"\", con)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "### Process speeches"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "print(set(df['party'].to_list()))"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "# Removing keys from interruptions of a speech\n",
73
+ "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) \n",
74
+ "df['date'] = pd.to_datetime(df['date'])\n",
75
+ "df"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/html": [
86
+ "<div>\n",
87
+ "<style scoped>\n",
88
+ " .dataframe tbody tr th:only-of-type {\n",
89
+ " vertical-align: middle;\n",
90
+ " }\n",
91
+ "\n",
92
+ " .dataframe tbody tr th {\n",
93
+ " vertical-align: top;\n",
94
+ " }\n",
95
+ "\n",
96
+ " .dataframe thead th {\n",
97
+ " text-align: right;\n",
98
+ " }\n",
99
+ "</style>\n",
100
+ "<table border=\"1\" class=\"dataframe\">\n",
101
+ " <thead>\n",
102
+ " <tr style=\"text-align: right;\">\n",
103
+ " <th></th>\n",
104
+ " <th>id</th>\n",
105
+ " <th>speech_content</th>\n",
106
+ " <th>date</th>\n",
107
+ " <th>party</th>\n",
108
+ " </tr>\n",
109
+ " </thead>\n",
110
+ " <tbody>\n",
111
+ " <tr>\n",
112
+ " <th>0</th>\n",
113
+ " <td>0</td>\n",
114
+ " <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
115
+ " <td>1949-09-12</td>\n",
116
+ " <td>not found</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>1</th>\n",
120
+ " <td>1</td>\n",
121
+ " <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
122
+ " <td>1949-09-12</td>\n",
123
+ " <td>not found</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>2</th>\n",
127
+ " <td>2</td>\n",
128
+ " <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
129
+ " <td>1949-09-12</td>\n",
130
+ " <td>not found</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>3</td>\n",
135
+ " <td>Ja, ich habe den Wunsch.\\n</td>\n",
136
+ " <td>1949-09-12</td>\n",
137
+ " <td>not found</td>\n",
138
+ " </tr>\n",
139
+ " <tr>\n",
140
+ " <th>4</th>\n",
141
+ " <td>4</td>\n",
142
+ " <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
143
+ " <td>1949-09-12</td>\n",
144
+ " <td>not found</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>...</th>\n",
148
+ " <td>...</td>\n",
149
+ " <td>...</td>\n",
150
+ " <td>...</td>\n",
151
+ " <td>...</td>\n",
152
+ " </tr>\n",
153
+ " <tr>\n",
154
+ " <th>930955</th>\n",
155
+ " <td>1084268</td>\n",
156
+ " <td>\\n\\nWir sind zwar Kollegen.</td>\n",
157
+ " <td>2022-12-16</td>\n",
158
+ " <td>not found</td>\n",
159
+ " </tr>\n",
160
+ " <tr>\n",
161
+ " <th>930956</th>\n",
162
+ " <td>1084269</td>\n",
163
+ " <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
164
+ " <td>2022-12-16</td>\n",
165
+ " <td>CDU/CSU</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>930957</th>\n",
169
+ " <td>1084270</td>\n",
170
+ " <td>\\n\\nVielen Dank.</td>\n",
171
+ " <td>2022-12-16</td>\n",
172
+ " <td>not found</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>930958</th>\n",
176
+ " <td>1084272</td>\n",
177
+ " <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
178
+ " <td>2022-12-16</td>\n",
179
+ " <td>not found</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>930959</th>\n",
183
+ " <td>1084273</td>\n",
184
+ " <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
185
+ " <td>2022-12-16</td>\n",
186
+ " <td>SPD</td>\n",
187
+ " </tr>\n",
188
+ " </tbody>\n",
189
+ "</table>\n",
190
+ "<p>930960 rows × 4 columns</p>\n",
191
+ "</div>"
192
+ ],
193
+ "text/plain": [
194
+ " id speech_content date \\\n",
195
+ "0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... 1949-09-12 \n",
196
+ "1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n 1949-09-12 \n",
197
+ "2 2 Ich danke für diese Erklärung. Ich stelle dami... 1949-09-12 \n",
198
+ "3 3 Ja, ich habe den Wunsch.\\n 1949-09-12 \n",
199
+ "4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... 1949-09-12 \n",
200
+ "... ... ... ... \n",
201
+ "930955 1084268 \\n\\nWir sind zwar Kollegen. 2022-12-16 \n",
202
+ "930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! 2022-12-16 \n",
203
+ "930957 1084270 \\n\\nVielen Dank. 2022-12-16 \n",
204
+ "930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... 2022-12-16 \n",
205
+ "930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... 2022-12-16 \n",
206
+ "\n",
207
+ " party \n",
208
+ "0 not found \n",
209
+ "1 not found \n",
210
+ "2 not found \n",
211
+ "3 not found \n",
212
+ "4 not found \n",
213
+ "... ... \n",
214
+ "930955 not found \n",
215
+ "930956 CDU/CSU \n",
216
+ "930957 not found \n",
217
+ "930958 not found \n",
218
+ "930959 SPD \n",
219
+ "\n",
220
+ "[930960 rows x 4 columns]"
221
+ ]
222
+ },
223
+ "execution_count": 3,
224
+ "metadata": {},
225
+ "output_type": "execute_result"
226
+ }
227
+ ],
228
+ "source": [
229
+ "# Convert to proper time format\n",
230
+ "df['date'] = pd.to_datetime(df['date'])\n"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 27,
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "def split_documents(df, min_chunk_size=100):\n",
240
+ " \"\"\"\n",
241
+ " Load documents from a DataFrame, split them into smaller chunks for vector storage and remove chunks of small size.\n",
242
+ "\n",
243
+ " Parameters\n",
244
+ " ----------\n",
245
+ " df : pandas.DataFrame\n",
246
+ " A DataFrame containing the documents to be processed, with a column named 'speech_content'.\n",
247
+ " min_chunk_size : int, optional\n",
248
+ " Minimum number of characters a chunk must have to be included in the result. Default is 100.\n",
249
+ "\n",
250
+ " Returns\n",
251
+ " -------\n",
252
+ " list\n",
253
+ " A list of split document chunks ready for further processing or vectorization.\n",
254
+ " \"\"\"\n",
255
+ " # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load\n",
256
+ " loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')\n",
257
+ " # Load the data from the DataFrame into a suitable format for processing\n",
258
+ " data = loader.load()\n",
259
+ " # Initialize a RecursiveCharacterTextSplitter to split the text into chunks\n",
260
+ " splitter = RecursiveCharacterTextSplitter(\n",
261
+ " chunk_size=1024,\n",
262
+ " chunk_overlap=32,\n",
263
+ " length_function=len,\n",
264
+ " is_separator_regex=False,\n",
265
+ " )\n",
266
+ " # Split the loaded data into smaller chunks using the splitter\n",
267
+ " documents = splitter.split_documents(documents=data)\n",
268
+ " # Discard small chunks below the threshold\n",
269
+ " cleaned_documents = [doc for doc in documents if len(doc.page_content) >= min_chunk_size]\n",
270
+ "\n",
271
+ " return cleaned_documents"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stderr",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
284
+ " warnings.warn(\n"
285
+ ]
286
+ },
287
+ {
288
+ "name": "stdout",
289
+ "output_type": "stream",
290
+ "text": [
291
+ "Sucessfully created vector store for 1. legislature\n",
292
+ "Sucessfully created vector store for 2. legislature\n",
293
+ "Sucessfully created vector store for 3. legislature\n",
294
+ "Sucessfully created vector store for 4. legislature\n",
295
+ "Sucessfully created vector store for 5. legislature\n",
296
+ "Sucessfully created vector store for 6. legislature\n",
297
+ "Sucessfully created vector store for 7. legislature\n",
298
+ "Sucessfully created vector store for 8. legislature\n",
299
+ "Sucessfully created vector store for 9. legislature\n",
300
+ "Sucessfully created vector store for 10. legislature\n",
301
+ "Sucessfully created vector store for 11. legislature\n",
302
+ "Sucessfully created vector store for 12. legislature\n",
303
+ "Sucessfully created vector store for 13. legislature\n",
304
+ "Sucessfully created vector store for 14. legislature\n",
305
+ "Sucessfully created vector store for 15. legislature\n",
306
+ "Sucessfully created vector store for 16. legislature\n",
307
+ "Sucessfully created vector store for 17. legislature\n",
308
+ "Sucessfully created vector store for 18. legislature\n",
309
+ "Sucessfully created vector store for 19. legislature\n",
310
+ "Sucessfully created vector store for 20. legislature\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "# Define starting dates of legislature periods\n",
316
+ "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
317
+ "# Load sentence transformer \n",
318
+ "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
319
+ "\n",
320
+ "# Create vector store for all speeches\n",
321
+ "# Split text into documents for vectorstore\n",
322
+ "documents = split_documents(df)\n",
323
+ "# Create and save faiss vectorstorage\n",
324
+ "index_name = 'speeches_1949_09_12'\n",
325
+ "db = FAISS.from_documents(documents, embeddings)\n",
326
+ "db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
327
+ "print(\"Sucessfully created vector store for all legislature\")\n",
328
+ "\n",
329
+ "# Create vector store for each legislature\n",
330
+ "# loop parameters\n",
331
+ "period = 1\n",
332
+ "previous_date = None\n",
333
+ "\n",
334
+ "# Iterate over all date to split by legislature getting vector stores for each period\n",
335
+ "for date in dates:\n",
336
+ " if previous_date is None:\n",
337
+ " legislature_df = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
338
+ " elif date is None:\n",
339
+ " legislature_df = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
340
+ " else:\n",
341
+ " legislature_df = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
342
+ "\n",
343
+ " \n",
344
+ " # Split text into documents for vectorstore\n",
345
+ " documents = split_documents(legislature_df)\n",
346
+ "\n",
347
+ " # Create and save faiss vectorstorage\n",
348
+ " index_name = f'{period}_legislature'\n",
349
+ " db = FAISS.from_documents(documents, embeddings)\n",
350
+ " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
351
+ " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
352
+ "\n",
353
+ " # Change loop parameters for next iteration\n",
354
+ " period += 1\n",
355
+ " previous_date = date\n",
356
+ "\n",
357
+ "\n",
358
+ " \n"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "markdown",
363
+ "metadata": {},
364
+ "source": [
365
+ "This data has been uploaded to: https://huggingface.co/datasets/TomData/speeches-of-the-german-parliament"
366
+ ]
367
+ }
368
+ ],
369
+ "metadata": {
370
+ "kernelspec": {
371
+ "display_name": "Python 3",
372
+ "language": "python",
373
+ "name": "python3"
374
+ },
375
+ "language_info": {
376
+ "codemirror_mode": {
377
+ "name": "ipython",
378
+ "version": 3
379
+ },
380
+ "file_extension": ".py",
381
+ "mimetype": "text/x-python",
382
+ "name": "python",
383
+ "nbconvert_exporter": "python",
384
+ "pygments_lexer": "ipython3",
385
+ "version": "3.11.4"
386
+ }
387
+ },
388
+ "nbformat": 4,
389
+ "nbformat_minor": 2
390
+ }
src/chatbot.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import ChatHuggingFace
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain
from langchain_community.docstore.in_memory import InMemoryDocstore
from faiss import IndexFlatL2
import pandas as pd
# Load environment variables (e.g. the HF API token) from the .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# Define important variables
# Must match the model the FAISS indices were built with (see src/FAISS/FAISS.ipynb)
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
endpoint = HuggingFaceEndpoint(
    # ToDo: Experiment with different models here
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    provider="novita",
    task="conversational",
    max_new_tokens=512,
    temperature=0.01,  # near-deterministic answers
    top_k=30,
    repetition_penalty=1.03,
)
llm = ChatHuggingFace(llm=endpoint)


# ToDo: Experiment with different templates
# Mistral-style [INST] template kept for experiments; not selected by chatbot()
prompt_test = ChatPromptTemplate.from_template("""<s>[INST]
Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:

Context: {context}

Question: {input}
[/INST]"""

)
prompt_de = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:

<context>
{context}
</context>

Frage: {input}
"""
# Returns the answer in German
)
prompt_en = ChatPromptTemplate.from_template("""Answer the following question in English and solely based on the provided context:

<context>
{context}
</context>

Question: {input}
"""
# Returns the answer in English
)
# Pre-load the full vector database once at import time to reduce inference
# latency during production (get_vectorstore returns it for the "All" case)
db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
64
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on
    the specified inputs.

    Parameters
    ----------
    inputs : list of str
        Selected legislatures, e.g. ["20. Legislaturperiode", ...]. If the
        selection is empty, or its first entry is "All" or None, the
        pre-loaded store covering all speeches (``db_all``) is returned
        directly.
    embeddings : Embeddings
        Embedding model used to load the per-legislature vector stores; must
        match the model the indices were built with.

    Returns
    -------
    FAISS
        A FAISS vector store merging all selected legislature indices.
    """
    # Folder holding the per-legislature FAISS indices
    folder_path = "./src/FAISS"

    # Shortcut: empty selection or "All" maps to the pre-loaded full store.
    # (Guarding `not inputs` avoids an IndexError on an empty selection.)
    if not inputs or inputs[0] == "All" or inputs[0] is None:
        return db_all

    # Start from an empty store with the embedding dimensionality
    dimensions = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Entries look like "20. Legislaturperiode"; the leading number selects
    # the on-disk index name.
    for selection in inputs:
        # Ignore "All" if the user selected it alongside specific legislatures
        if selection == "All":
            continue
        number = selection.split(".")[0]
        local_db = FAISS.load_local(
            folder_path=folder_path,
            index_name=f"{number}_legislature",
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        db.merge_from(local_db)
    # Report once after all selected stores were merged
    # (previously printed on every loop iteration)
    print('Successfully merged inputs')
    return db
119
+
120
def RAG(llm, prompt, db, question):
    """
    Answer *question* via Retrieval-Augmented Generation: retrieve relevant
    documents from *db*, stuff them into *prompt*, and invoke *llm*.

    Parameters
    ----------
    llm : LanguageModel
        Language model instance used to generate the response.
    prompt : str
        Prompt template that structures context and question for the model.
    db : VectorStore
        Vector store supporting retrieval of documents relevant to the question.
    question : str
        The question to be answered.

    Returns
    -------
    str
        The chain response based on the retrieved context and the question.
    """
    # Expose the vector store through the retriever interface
    retriever = db.as_retriever()
    # Chain that stuffs the retrieved documents into the prompt for the LLM
    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    # Full retrieval + generation pipeline
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    # Run the pipeline on the question and return its response
    return retrieval_chain.invoke({"input": question})
155
+
156
async def chatbot(message, history, db_inputs, prompt_language, llm=llm):
    """
    Produce a chatbot answer for a user message via RAG over selected vector stores.

    Parameters
    ----------
    message : str
        The question to be answered by the chatbot.
    history : list
        Previous interactions (unused here; required by the chat UI callback signature).
    db_inputs : list
        Strings naming which legislature vector stores to combine; may contain "All".
    prompt_language : str
        "DE" for the German prompt, anything else selects the English prompt.
    llm : LLM, optional
        Language model used for generation; defaults to the module-level `llm`.

    Returns
    -------
    str
        The generated answer, with the localized "Answer:"/"Antwort:" prefix
        stripped when present.
    """
    # Assemble the vector store from the legislatures the user selected.
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)

    # Pick the prompt template matching the requested language.
    is_german = prompt_language == "DE"
    prompt = prompt_de if is_german else prompt_en

    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)

    answer_key = "answer"
    prefix = "Antwort: " if is_german else "Answer: "
    try:
        # Strip the localized answer prefix if the model emitted it.
        response = raw_response[answer_key].split(prefix)[1]
    except (KeyError, IndexError):
        # Prefix absent or key missing: fall back to the raw answer text.
        response = raw_response.get(answer_key, "Error generating response.")

    # Guarantee a string result for the UI layer.
    return str(response)
196
+
197
+
198
def keyword_search(query, n=10, embeddings=embeddings, method="ss", party_filter="All"):
    """
    Retrieve speech contents matching keyword(s) from the combined vector store.

    Parameters
    ----------
    query : str
        The keyword(s) to search for in the speech contents.

    n : int, optional
        The number of speech contents to retrieve (default is 10).

    embeddings : Embeddings, optional
        Embeddings instance used to embed the query (defaults to the
        module-level `embeddings`).

    method : str, optional
        Retrieval method: 'ss' (semantic/similarity search) or 'mmr'
        (maximal marginal relevance). Default is 'ss'.

    party_filter : str, optional
        Restrict results to one party affiliation; 'All' (default) keeps
        speeches from every party.

    Returns
    -------
    pandas.DataFrame
        Columns 'Speech Content', 'Date', 'Party' and, for method='mmr',
        'Relevance' (rounded L2 score; lower means more relevant), sorted
        ascending by relevance.

    Raises
    ------
    ValueError
        If `method` is neither 'ss' nor 'mmr'.
    """
    # Search across all legislatures; party filtering happens post-retrieval.
    db = get_vectorstore(inputs=["All"], embeddings=embeddings)
    query_embedding = embeddings.embed_query(query)

    # Maximal Marginal Relevance
    if method == "mmr":
        rows = []
        results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
        for doc, score in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         # Format the date the same way as the 'ss' branch.
                         'Date': doc.metadata["date"].strftime("%Y-%m-%d"),
                         'Party': party,
                         # FAISS score is an L2 distance: lower = more relevant.
                         'Relevance': round(score, ndigits=2)})
        # Build the frame once from collected rows instead of pd.concat per
        # iteration (which is quadratic and deprecated for empty frames).
        # Explicit columns keep the schema stable when no row passes the filter.
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        df_res.sort_values('Relevance', inplace=True, ascending=True)
        return df_res

    # Similarity Search
    elif method == "ss":
        kws_data = []
        results = db.similarity_search_by_vector(query_embedding, k=n)
        for doc in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            kws_data.append({'Speech Content': doc.page_content,
                             'Date': doc.metadata["date"].strftime("%Y-%m-%d"),
                             'Party': party})
        return pd.DataFrame(kws_data)

    else:
        raise ValueError("Method must be either 'ss' or 'mmr'")
283
+
284
+
285
+
286
+