TomData commited on
Commit
83c7ddf
·
1 Parent(s): 50a5a90

project upload

Browse files
Files changed (7) hide show
  1. .gitattributes +4 -34
  2. .gitignore +4 -0
  3. Home.py +162 -0
  4. README.md +9 -7
  5. requirements.txt +57 -0
  6. src/FAISS/FAISS.ipynb +390 -0
  7. src/chatbot.py +286 -0
.gitattributes CHANGED
@@ -1,35 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.pkl filter=lfs diff=lfs merge=lfs -text
2
+ *.csv filter=lfs diff=lfs merge=lfs -text
3
+ *.faiss filter=lfs diff=lfs merge=lfs -text
4
+ src/FAISS/speeches_1949_09_12.faiss filter=lfs diff=lfs merge=lfs -text
5
+ src/FAISS/speeches_1949_09_12.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__
2
+ hf_upload.py
3
+ .env
4
+ .mypy_cache
Home.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from huggingface_hub import snapshot_download

# Download the legislature vector database from the Hub BEFORE importing
# src.chatbot below: that module loads the FAISS index from src/FAISS at
# import time, so the files must already exist on disk.
REPO_ID = "TomData/speeches-of-the-german-parliament"
LOCAL_DIR = "src/FAISS"
snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_DIR, repo_type="dataset")

import gradio as gr
#from gradio_calendar import Calendar
#from datetime import datetime
from src.chatbot import chatbot, keyword_search

# Only required when running locally
# import os
# from dotenv import load_dotenv
# from huggingface_hub import login
# load_dotenv(dotenv_path=".env")
# login(token=os.getenv("HUGGINGFACEHUB_API_TOKEN")) # Your token here
20
# UI choices: "All" plus every legislature period, newest (20.) first.
legislature_periods = ["All"] + [
    f"{n}. Legislaturperiode" for n in range(20, 0, -1)
]

# Parties occurring in the speeches corpus; "All" disables the party filter.
partys = [
    'All',
    'CDU/CSU', 'SPD', 'AfD', 'Grüne', 'FDP', 'DIE LINKE.',
    'GB/BHE', 'DRP', 'WAV', 'NR', 'BP', 'FU', 'SSW', 'KPD',
    'DA', 'FVP', 'DP', 'Z', 'PDS', 'Fraktionslos',
    'not found', 'Gast',
]
48
# ---------------------------------------------------------------------------
# Gradio application layout: three tabs (RAG chatbot, keyword search, about).
# ---------------------------------------------------------------------------
with gr.Blocks() as App:

    with gr.Tab("ChatBot"):
        with gr.Blocks(fill_height=True):
            with gr.Accordion(open=False, label="Filter"):
                # Retrieval filters forwarded to chatbot() in src/chatbot.py
                db_inputs = gr.Dropdown(
                    choices=legislature_periods,
                    value="All",
                    multiselect=True,
                    label="Legislature",
                    info="Select a combination of legislatures as basis for the chatbot's replies",
                    show_label=True,
                )
                prompt_language = gr.Dropdown(
                    choices=["DE", "EN"],
                    value="DE",
                    label="Language",
                    info="Choose output language",
                    multiselect=False,
                )

            gr.ChatInterface(
                chatbot,
                title="PoliticsToYou",
                description="Ask anything about your favorite political topic from any legislature period",
                examples=[
                    ["Wie steht die CDU zur Cannabislegalisierung?", "All", "DE"],
                    ["Wie steht die FDP zur Rente?", "All", "DE"],
                    ["Was sagten die Parteien in der ersten Legislaturperiode über die nazi Vergangenheit?", "1. Legislaturperiode", "DE"],
                    ["Wie wird die Ehe für alle diskutiert?", "18. Legislaturperiode", "DE"],
                    ["How is the GDR perceived?", "11. Legislaturperiode", "EN"],
                ],
                cache_examples=True,  # true increases loading time
                additional_inputs=[db_inputs, prompt_language],
            )

    with gr.Tab("KeywordSearch"):

        with gr.Blocks() as Block:
            # Free-text keyword the user searches for
            keyword_box = gr.Textbox(label='keyword')

            # Additional filters, collapsed by default
            with gr.Accordion('Filter', open=False):
                with gr.Row() as additional_input:
                    n_slider = gr.Slider(label="Number of Results", info="Other filters reduces the returned results", minimum=1, maximum=100, step=1, value=10)
                    party_dropdown = gr.Dropdown(value='All', choices=partys, label='Party')
                    # ToDo: Add date or legislature filter as input
                    #start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
                    #end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)

            search_btn = gr.Button('Search')

            # Results column stays hidden until the first search completes
            with gr.Column(visible=False) as output_col:
                results_df = gr.Dataframe(label='Results', interactive=False)

            # Download section for the current keyword-search results
            with gr.Accordion('Would you like to download your results?', open=False) as download_row:
                with gr.Row():
                    ftype_dropdown = gr.Dropdown(choices=["csv", "excel", "json"], label="Format")
                    export_btn = gr.Button('Export')
                file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)

            def run_search(keyword, n, party):  # ToDo: Include party and timedate
                """Run the keyword search and reveal the results column."""
                return {
                    output_col: gr.Column(visible=True),
                    results_df: keyword_search(query=keyword, n=n, party_filter=party),
                }

            search_btn.click(
                fn=run_search,
                inputs=[keyword_box, n_slider, party_dropdown],
                outputs=[output_col, results_df],
            )

            def export(df, keyword, ftype=None):
                """Write the result table to disk in the chosen format and show the file widget."""
                if ftype == "csv":
                    path = f'{keyword}.csv'
                    df.to_csv(path, index=False)
                elif ftype == "json":
                    path = f'{keyword}.json'
                    df.to_json(path, index=True)
                else:  # any other choice (incl. "excel"/None) falls back to Excel
                    path = f'{keyword}.xlsx'
                    df.to_excel(path, index=True)
                return gr.File(value=path, visible=True)

            export_btn.click(
                fn=export,
                inputs=[results_df, keyword_box, ftype_dropdown],
                outputs=[file],
            )

    with gr.Tab("About"):
        gr.Markdown("""
        <h2>Welcome to <strong>PoliticsToYou</strong> - your playground for investigating the heart of politics in Germany</h2>
        <ul>
        <p>Would you like to gain insights into political debates or reveal party positions on specific topics from any legislature?</p>
        <br>
        <p>You can use the ChatBot to ask all your questions or search for related speech content in the Keyword Search section.</p>
        </ul>
        <p>Looking forward to your feedback!</p>

        <h3>Further improvements & Ideas:</h3>
        <ul>
        <li>Experiment with different LLMs and prompt templates</li>
        <li>Include chat history</li>
        <li>Add a legislature filter to KeywordSearch</li>
        <li>Exclude short document splits</li>
        <li>Improve inference time</li>
        <li>Update the database every month with the latest content</li>
        <li>Expand the scope to party manifestos and different countries</li>
        </ul>

        <p>Big thank you to the OpenDiscourse team for creating the underlying speeches corpus. Check out their website <a href="https://opendiscourse.de/">here</a>.</p>

        """
        )

if __name__ == "__main__":
    App.launch(share=False)  # True not supported on HF Spaces
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: Test
3
- emoji: 👀
4
- colorFrom: gray
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
 
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: PoliticsToYou
3
+ emoji: 🏢
4
+ colorFrom: pink
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.26.0
8
+ python_version: 3.11
9
+ app_file: Home.py
10
  pinned: false
11
+ short_description: Explore speeches from the German Bundestag since 1949.
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
requirements.txt ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Network and Async
2
+ aiohttp>=3.11.0
3
+ aiosignal>=1.3.1
4
+ annotated-types>=0.7.0
5
+ anyio>=4.8.0
6
+ attrs>=25.1.0
7
+ certifi>=2025.1.0
8
+ charset-normalizer>=3.4.0
9
+
10
+ numpy==1.26.4
11
+ pandas<3
12
+ Pillow<11
13
+ MarkupSafe<3
14
+ pydantic==2.10.6
15
+
16
+ # Data Science and Processing
17
+ # numpy>=2.4.1
18
+ # pandas>=2.3.3
19
+ # Pillow>=12.1.0
20
+ openpyxl>=3.1.5
21
+ et-xmlfile>=2.0.0
22
+ mpmath>=1.3.0
23
+
24
+ # Fixed Core Frameworks
25
+ langchain>=1.0.0
26
+ langchain-classic>=1.0.0
27
+ langchain-community>=0.3.0
28
+ langchain-core>=1.0.0
29
+ langchain-huggingface>=1.0.0
30
+ langchain-text-splitters>=1.0.0
31
+
32
+ # Vector Search and AI Infrastructure
33
+ faiss-cpu>=1.13.2
34
+ huggingface-hub>=0.36.0
35
+ fsspec>=2025.1.0
36
+ filelock>=3.16.0
37
+ sentence-transformers>=3.3.0
38
+
39
+ # Serialization and Utilities
40
+ orjson>=3.10.12
41
+ jsonpatch>=1.33
42
+ jsonpointer>=3.0.0
43
+ marshmallow>=3.23.0
44
+ joblib>=1.4.2
45
+ packaging>=24.2
46
+ idna>=3.10
47
+ typing-extensions>=4.12.0
48
+
49
+ # Web and UI
50
+ # Jinja2>=3.1.5
51
+ # MarkupSafe>=3.0.0
52
+
53
+ # System and Performance
54
+ greenlet>=3.1.1
55
+ networkx>=3.4.2
56
+ frozenlist>=1.5.0
57
+ dataclasses-json>=0.6.7
src/FAISS/FAISS.ipynb ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import psycopg2\n",
11
+ "\n",
12
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
13
+ "from langchain_community.document_loaders import DataFrameLoader\n",
14
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
15
+ "from langchain_community.vectorstores import FAISS\n",
16
+ "from datetime import datetime\n",
17
+ "\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "### Retrieve Speeches"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# db_connection -----------------------------------------------------------\n",
34
+ "con_details = {\n",
35
+ " \"host\" : \"localhost\",\n",
36
+ " \"database\" : \"next\",\n",
37
+ " \"user\" : \"postgres\",\n",
38
+ " \"password\" : \"postgres\",\n",
39
+ " \"port\" : \"5433\"\n",
40
+ "}\n",
41
+ "con = psycopg2.connect(**con_details)\n",
42
+ "\n",
43
+ "# get data tables ---------------------------------------------------------\n",
44
+ "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
45
+ " FROM open_discourse.speeches AS s\n",
46
+ " INNER JOIN open_discourse.factions AS f ON\n",
47
+ " s.faction_id = f.id;\"\"\", con)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "### Process speeches"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "print(set(df['party'].to_list()))"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "# Removing keys from interruptions of a speech\n",
73
+ "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) \n",
74
+ "df['date'] = pd.to_datetime(df['date'])\n",
75
+ "df"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/html": [
86
+ "<div>\n",
87
+ "<style scoped>\n",
88
+ " .dataframe tbody tr th:only-of-type {\n",
89
+ " vertical-align: middle;\n",
90
+ " }\n",
91
+ "\n",
92
+ " .dataframe tbody tr th {\n",
93
+ " vertical-align: top;\n",
94
+ " }\n",
95
+ "\n",
96
+ " .dataframe thead th {\n",
97
+ " text-align: right;\n",
98
+ " }\n",
99
+ "</style>\n",
100
+ "<table border=\"1\" class=\"dataframe\">\n",
101
+ " <thead>\n",
102
+ " <tr style=\"text-align: right;\">\n",
103
+ " <th></th>\n",
104
+ " <th>id</th>\n",
105
+ " <th>speech_content</th>\n",
106
+ " <th>date</th>\n",
107
+ " <th>party</th>\n",
108
+ " </tr>\n",
109
+ " </thead>\n",
110
+ " <tbody>\n",
111
+ " <tr>\n",
112
+ " <th>0</th>\n",
113
+ " <td>0</td>\n",
114
+ " <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
115
+ " <td>1949-09-12</td>\n",
116
+ " <td>not found</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>1</th>\n",
120
+ " <td>1</td>\n",
121
+ " <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
122
+ " <td>1949-09-12</td>\n",
123
+ " <td>not found</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>2</th>\n",
127
+ " <td>2</td>\n",
128
+ " <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
129
+ " <td>1949-09-12</td>\n",
130
+ " <td>not found</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>3</td>\n",
135
+ " <td>Ja, ich habe den Wunsch.\\n</td>\n",
136
+ " <td>1949-09-12</td>\n",
137
+ " <td>not found</td>\n",
138
+ " </tr>\n",
139
+ " <tr>\n",
140
+ " <th>4</th>\n",
141
+ " <td>4</td>\n",
142
+ " <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
143
+ " <td>1949-09-12</td>\n",
144
+ " <td>not found</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>...</th>\n",
148
+ " <td>...</td>\n",
149
+ " <td>...</td>\n",
150
+ " <td>...</td>\n",
151
+ " <td>...</td>\n",
152
+ " </tr>\n",
153
+ " <tr>\n",
154
+ " <th>930955</th>\n",
155
+ " <td>1084268</td>\n",
156
+ " <td>\\n\\nWir sind zwar Kollegen.</td>\n",
157
+ " <td>2022-12-16</td>\n",
158
+ " <td>not found</td>\n",
159
+ " </tr>\n",
160
+ " <tr>\n",
161
+ " <th>930956</th>\n",
162
+ " <td>1084269</td>\n",
163
+ " <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
164
+ " <td>2022-12-16</td>\n",
165
+ " <td>CDU/CSU</td>\n",
166
+ " </tr>\n",
167
+ " <tr>\n",
168
+ " <th>930957</th>\n",
169
+ " <td>1084270</td>\n",
170
+ " <td>\\n\\nVielen Dank.</td>\n",
171
+ " <td>2022-12-16</td>\n",
172
+ " <td>not found</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>930958</th>\n",
176
+ " <td>1084272</td>\n",
177
+ " <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
178
+ " <td>2022-12-16</td>\n",
179
+ " <td>not found</td>\n",
180
+ " </tr>\n",
181
+ " <tr>\n",
182
+ " <th>930959</th>\n",
183
+ " <td>1084273</td>\n",
184
+ " <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
185
+ " <td>2022-12-16</td>\n",
186
+ " <td>SPD</td>\n",
187
+ " </tr>\n",
188
+ " </tbody>\n",
189
+ "</table>\n",
190
+ "<p>930960 rows × 4 columns</p>\n",
191
+ "</div>"
192
+ ],
193
+ "text/plain": [
194
+ " id speech_content date \\\n",
195
+ "0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... 1949-09-12 \n",
196
+ "1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n 1949-09-12 \n",
197
+ "2 2 Ich danke für diese Erklärung. Ich stelle dami... 1949-09-12 \n",
198
+ "3 3 Ja, ich habe den Wunsch.\\n 1949-09-12 \n",
199
+ "4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... 1949-09-12 \n",
200
+ "... ... ... ... \n",
201
+ "930955 1084268 \\n\\nWir sind zwar Kollegen. 2022-12-16 \n",
202
+ "930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! 2022-12-16 \n",
203
+ "930957 1084270 \\n\\nVielen Dank. 2022-12-16 \n",
204
+ "930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... 2022-12-16 \n",
205
+ "930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... 2022-12-16 \n",
206
+ "\n",
207
+ " party \n",
208
+ "0 not found \n",
209
+ "1 not found \n",
210
+ "2 not found \n",
211
+ "3 not found \n",
212
+ "4 not found \n",
213
+ "... ... \n",
214
+ "930955 not found \n",
215
+ "930956 CDU/CSU \n",
216
+ "930957 not found \n",
217
+ "930958 not found \n",
218
+ "930959 SPD \n",
219
+ "\n",
220
+ "[930960 rows x 4 columns]"
221
+ ]
222
+ },
223
+ "execution_count": 3,
224
+ "metadata": {},
225
+ "output_type": "execute_result"
226
+ }
227
+ ],
228
+ "source": [
229
+ "# Convert to proper time format\n",
230
+ "df['date'] = pd.to_datetime(df['date'])\n"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 27,
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "def split_documents(df, min_chunk_size=100):\n",
240
+ " \"\"\"\n",
241
+ " Load documents from a DataFrame, split them into smaller chunks for vector storage and remove chunks of small size.\n",
242
+ "\n",
243
+ " Parameters\n",
244
+ " ----------\n",
245
+ " df : pandas.DataFrame\n",
246
+ " A DataFrame containing the documents to be processed, with a column named 'speech_content'.\n",
247
+ " min_chunk_size : int, optional\n",
248
+ " Minimum number of characters a chunk must have to be included in the result. Default is 100.\n",
249
+ "\n",
250
+ " Returns\n",
251
+ " -------\n",
252
+ " list\n",
253
+ " A list of split document chunks ready for further processing or vectorization.\n",
254
+ " \"\"\"\n",
255
+ " # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load\n",
256
+ " loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')\n",
257
+ " # Load the data from the DataFrame into a suitable format for processing\n",
258
+ " data = loader.load()\n",
259
+ " # Initialize a RecursiveCharacterTextSplitter to split the text into chunks\n",
260
+ " splitter = RecursiveCharacterTextSplitter(\n",
261
+ " chunk_size=1024,\n",
262
+ " chunk_overlap=32,\n",
263
+ " length_function=len,\n",
264
+ " is_separator_regex=False,\n",
265
+ " )\n",
266
+ " # Split the loaded data into smaller chunks using the splitter\n",
267
+ " documents = splitter.split_documents(documents=data)\n",
268
+ " # Discard small chunks below the threshold\n",
269
+ " cleaned_documents = [doc for doc in documents if len(doc.page_content) >= min_chunk_size]\n",
270
+ "\n",
271
+ " return cleaned_documents"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stderr",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
284
+ " warnings.warn(\n"
285
+ ]
286
+ },
287
+ {
288
+ "name": "stdout",
289
+ "output_type": "stream",
290
+ "text": [
291
+ "Sucessfully created vector store for 1. legislature\n",
292
+ "Sucessfully created vector store for 2. legislature\n",
293
+ "Sucessfully created vector store for 3. legislature\n",
294
+ "Sucessfully created vector store for 4. legislature\n",
295
+ "Sucessfully created vector store for 5. legislature\n",
296
+ "Sucessfully created vector store for 6. legislature\n",
297
+ "Sucessfully created vector store for 7. legislature\n",
298
+ "Sucessfully created vector store for 8. legislature\n",
299
+ "Sucessfully created vector store for 9. legislature\n",
300
+ "Sucessfully created vector store for 10. legislature\n",
301
+ "Sucessfully created vector store for 11. legislature\n",
302
+ "Sucessfully created vector store for 12. legislature\n",
303
+ "Sucessfully created vector store for 13. legislature\n",
304
+ "Sucessfully created vector store for 14. legislature\n",
305
+ "Sucessfully created vector store for 15. legislature\n",
306
+ "Sucessfully created vector store for 16. legislature\n",
307
+ "Sucessfully created vector store for 17. legislature\n",
308
+ "Sucessfully created vector store for 18. legislature\n",
309
+ "Sucessfully created vector store for 19. legislature\n",
310
+ "Sucessfully created vector store for 20. legislature\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "# Define starting dates of legislature periods\n",
316
+ "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
317
+ "# Load sentence transformer \n",
318
+ "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
319
+ "\n",
320
+ "# Create vector store for all speeches\n",
321
+ "# Split text into documents for vectorstore\n",
322
+ "documents = split_documents(df)\n",
323
+ "# Create and save faiss vectorstorage\n",
324
+ "index_name = 'speeches_1949_09_12'\n",
325
+ "db = FAISS.from_documents(documents, embeddings)\n",
326
+ "db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
327
+ "print(\"Sucessfully created vector store for all legislature\")\n",
328
+ "\n",
329
+ "# Create vector store for each legislature\n",
330
+ "# loop parameters\n",
331
+ "period = 1\n",
332
+ "previous_date = None\n",
333
+ "\n",
334
+ "# Iterate over all date to split by legislature getting vector stores for each period\n",
335
+ "for date in dates:\n",
336
+ " if previous_date is None:\n",
337
+ " legislature_df = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
338
+ " elif date is None:\n",
339
+ " legislature_df = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
340
+ " else:\n",
341
+ " legislature_df = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
342
+ "\n",
343
+ " \n",
344
+ " # Split text into documents for vectorstore\n",
345
+ " documents = split_documents(legislature_df)\n",
346
+ "\n",
347
+ " # Create and save faiss vectorstorage\n",
348
+ " index_name = f'{period}_legislature'\n",
349
+ " db = FAISS.from_documents(documents, embeddings)\n",
350
+ " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
351
+ " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
352
+ "\n",
353
+ " # Change loop parameters for next iteration\n",
354
+ " period += 1\n",
355
+ " previous_date = date\n",
356
+ "\n",
357
+ "\n",
358
+ " \n"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "markdown",
363
+ "metadata": {},
364
+ "source": [
365
+ "This data has been uploaded to: https://huggingface.co/datasets/TomData/speeches-of-the-german-parliament"
366
+ ]
367
+ }
368
+ ],
369
+ "metadata": {
370
+ "kernelspec": {
371
+ "display_name": "Python 3",
372
+ "language": "python",
373
+ "name": "python3"
374
+ },
375
+ "language_info": {
376
+ "codemirror_mode": {
377
+ "name": "ipython",
378
+ "version": 3
379
+ },
380
+ "file_extension": ".py",
381
+ "mimetype": "text/x-python",
382
+ "name": "python",
383
+ "nbconvert_exporter": "python",
384
+ "pygments_lexer": "ipython3",
385
+ "version": "3.11.4"
386
+ }
387
+ },
388
+ "nbformat": 4,
389
+ "nbformat_minor": 2
390
+ }
src/chatbot.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import ChatHuggingFace
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain
from langchain_community.docstore.in_memory import InMemoryDocstore
from faiss import IndexFlatL2
import pandas as pd
# Load environment variables (e.g. the HF API token) from the .env file
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# Define important variables
# Must match the model the FAISS indices were built with (see src/FAISS/FAISS.ipynb)
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
endpoint = HuggingFaceEndpoint(
    # ToDo: Experiment with different models here
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    provider="novita",
    task="conversational",
    max_new_tokens=512,
    temperature=0.01,  # near-deterministic answers
    top_k=30,
    repetition_penalty=1.03,
)
llm = ChatHuggingFace(llm=endpoint)


# ToDo: Experiment with different templates
# Mistral-style [INST] template kept for experiments; not selected by chatbot()
prompt_test = ChatPromptTemplate.from_template("""<s>[INST]
Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:

Context: {context}

Question: {input}
[/INST]"""

)
prompt_de = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:

<context>
{context}
</context>

Frage: {input}
"""
# Returns the answer in German
)
prompt_en = ChatPromptTemplate.from_template("""Answer the following question in English and solely based on the provided context:

<context>
{context}
</context>

Question: {input}
"""
# Returns the answer in English
)
# Pre-load the full vector database once at import time to reduce inference
# latency during production (get_vectorstore returns it for the "All" case)
db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
64
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on
    the specified inputs.

    Parameters
    ----------
    inputs : list of str
        Selected legislatures, e.g. ["20. Legislaturperiode", ...]. If the
        selection is empty, or its first entry is "All" or None, the
        pre-loaded store covering all speeches (``db_all``) is returned
        directly.
    embeddings : Embeddings
        Embedding model used to load the per-legislature vector stores; must
        match the model the indices were built with.

    Returns
    -------
    FAISS
        A FAISS vector store merging all selected legislature indices.
    """
    # Folder holding the per-legislature FAISS indices
    folder_path = "./src/FAISS"

    # Shortcut: empty selection or "All" maps to the pre-loaded full store.
    # (Guarding `not inputs` avoids an IndexError on an empty selection.)
    if not inputs or inputs[0] == "All" or inputs[0] is None:
        return db_all

    # Start from an empty store with the embedding dimensionality
    dimensions = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Entries look like "20. Legislaturperiode"; the leading number selects
    # the on-disk index name.
    for selection in inputs:
        # Ignore "All" if the user selected it alongside specific legislatures
        if selection == "All":
            continue
        number = selection.split(".")[0]
        local_db = FAISS.load_local(
            folder_path=folder_path,
            index_name=f"{number}_legislature",
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        db.merge_from(local_db)
    # Report once after all selected stores were merged
    # (previously printed on every loop iteration)
    print('Successfully merged inputs')
    return db
119
+
120
def RAG(llm, prompt, db, question):
    """
    Answer *question* via Retrieval-Augmented Generation: retrieve relevant
    documents from *db*, stuff them into *prompt*, and invoke *llm*.

    Parameters
    ----------
    llm : LanguageModel
        Language model instance used to generate the response.
    prompt : str
        Prompt template that structures context and question for the model.
    db : VectorStore
        Vector store supporting retrieval of documents relevant to the question.
    question : str
        The question to be answered.

    Returns
    -------
    str
        The chain response based on the retrieved context and the question.
    """
    # Expose the vector store through the retriever interface
    retriever = db.as_retriever()
    # Chain that stuffs the retrieved documents into the prompt for the LLM
    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    # Full retrieval + generation pipeline
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    # Run the pipeline on the question and return its response
    return retrieval_chain.invoke({"input": question})
155
+
156
async def chatbot(message, history, db_inputs, prompt_language, llm=llm):
    """
    Produce a chatbot answer for a user message via RAG over selected vector stores.

    Parameters
    ----------
    message : str
        The question to be answered by the chatbot.
    history : list
        Previous interactions (unused here; required by the chat UI callback signature).
    db_inputs : list
        Strings naming which legislature vector stores to combine; may contain "All".
    prompt_language : str
        "DE" for the German prompt, anything else selects the English prompt.
    llm : LLM, optional
        Language model used for generation; defaults to the module-level `llm`.

    Returns
    -------
    str
        The generated answer, with the localized "Answer:"/"Antwort:" prefix
        stripped when present.
    """
    # Assemble the vector store from the legislatures the user selected.
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)

    # Pick the prompt template matching the requested language.
    is_german = prompt_language == "DE"
    prompt = prompt_de if is_german else prompt_en

    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)

    answer_key = "answer"
    prefix = "Antwort: " if is_german else "Answer: "
    try:
        # Strip the localized answer prefix if the model emitted it.
        response = raw_response[answer_key].split(prefix)[1]
    except (KeyError, IndexError):
        # Prefix absent or key missing: fall back to the raw answer text.
        response = raw_response.get(answer_key, "Error generating response.")

    # Guarantee a string result for the UI layer.
    return str(response)
196
+
197
+
198
def keyword_search(query, n=10, embeddings=embeddings, method="ss", party_filter="All"):
    """
    Retrieve speech contents matching keyword(s) from the combined vector store.

    Parameters
    ----------
    query : str
        The keyword(s) to search for in the speech contents.

    n : int, optional
        The number of speech contents to retrieve (default is 10).

    embeddings : Embeddings, optional
        Embeddings instance used to embed the query (defaults to the
        module-level `embeddings`).

    method : str, optional
        Retrieval method: 'ss' (semantic/similarity search) or 'mmr'
        (maximal marginal relevance). Default is 'ss'.

    party_filter : str, optional
        Restrict results to one party affiliation; 'All' (default) keeps
        speeches from every party.

    Returns
    -------
    pandas.DataFrame
        Columns 'Speech Content', 'Date', 'Party' and, for method='mmr',
        'Relevance' (rounded L2 score; lower means more relevant), sorted
        ascending by relevance.

    Raises
    ------
    ValueError
        If `method` is neither 'ss' nor 'mmr'.
    """
    # Search across all legislatures; party filtering happens post-retrieval.
    db = get_vectorstore(inputs=["All"], embeddings=embeddings)
    query_embedding = embeddings.embed_query(query)

    # Maximal Marginal Relevance
    if method == "mmr":
        rows = []
        results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
        for doc, score in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            rows.append({'Speech Content': doc.page_content,
                         # Format the date the same way as the 'ss' branch.
                         'Date': doc.metadata["date"].strftime("%Y-%m-%d"),
                         'Party': party,
                         # FAISS score is an L2 distance: lower = more relevant.
                         'Relevance': round(score, ndigits=2)})
        # Build the frame once from collected rows instead of pd.concat per
        # iteration (which is quadratic and deprecated for empty frames).
        # Explicit columns keep the schema stable when no row passes the filter.
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        df_res.sort_values('Relevance', inplace=True, ascending=True)
        return df_res

    # Similarity Search
    elif method == "ss":
        kws_data = []
        results = db.similarity_search_by_vector(query_embedding, k=n)
        for doc in results:
            party = doc.metadata["party"]
            if party != party_filter and party_filter != 'All':
                continue
            kws_data.append({'Speech Content': doc.page_content,
                             'Date': doc.metadata["date"].strftime("%Y-%m-%d"),
                             'Party': party})
        return pd.DataFrame(kws_data)

    else:
        raise ValueError("Method must be either 'ss' or 'mmr'")
283
+
284
+
285
+
286
+