Rulga commited on
Commit
ce09d77
·
0 Parent(s):

add new files

Browse files
Files changed (7) hide show
  1. .gitignore +6 -0
  2. README.md +15 -0
  3. app.py +210 -0
  4. gitattributes +44 -0
  5. gitignore +4 -0
  6. requirements.txt +9 -0
  7. two-in-one.py +150 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ /.streamlit
2
+ *.env
3
+ .env
4
+ venv
5
+ .streamlit/secrets.toml
6
+
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: New LS Chatbot App
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: It is a chat built with an AI model about www.Status.law
11
+ ---
12
+
13
+ # LS Chatbot App
14
+
15
+ A chat app built with Streamlit that lets users ask an AI model questions about www.Status.law.
app.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import streamlit as st
4
+ from dotenv import load_dotenv
5
+ from langchain_groq import ChatGroq
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from langchain_community.document_loaders import WebBaseLoader
10
+ from langchain_core.prompts import PromptTemplate
11
+ from langchain_core.output_parsers import StrOutputParser
12
+ from langchain_core.runnables import RunnableLambda
13
+ import requests
14
+ import json
15
+
16
# Page configuration
st.set_page_config(page_title="Status Law Assistant", page_icon="⚖️")

# Knowledge-base build stats live in session_state so the caption below
# survives reruns after the base is (re)built.
if 'kb_info' not in st.session_state:
    st.session_state.kb_info = {'build_time': None, 'size': None}

# Title with an inline link to the firm's site (HTML needed for the anchor styling).
st.markdown(
    """
    <h1>
    ⚖️
    <a href="https://status.law/" style="text-decoration: underline; color: blue; font-size: inherit;">
    Status.Law
    </a>
    Legal Assistant
    </h1>
    """,
    unsafe_allow_html=True,
)

# Show build stats only once a knowledge base has been created this session.
if st.session_state.kb_info['build_time'] and st.session_state.kb_info['size']:
    st.caption(f"(Knowledge base build time: {st.session_state.kb_info['build_time']:.2f} seconds, "
               f"size: {st.session_state.kb_info['size']:.2f} MB)")

# Path to store vector database
VECTOR_STORE_PATH = "vector_store"

# Create the chat-history folder if it does not exist.
os.makedirs("chat_history", exist_ok=True)
52
+
53
# Website URLs scraped into the knowledge base.
urls = [
    "https://status.law",
    "https://status.law/about",
    "https://status.law/careers",
    "https://status.law/challenging-sanctions",
    # FIX: a missing trailing comma here silently concatenated this URL with
    # the next one into a single invalid entry (implicit string concatenation),
    # so both pages were skipped during loading.
    "https://status.law/law-firm-contact-legal-protection",
    "https://status.law/cross-border-banking-legal-issues",
    "https://status.law/extradition-defense",
    "https://status.law/international-prosecution-protection",
    "https://status.law/interpol-red-notice-removal",
    "https://status.law/practice-areas",
    "https://status.law/reputation-protection",
    "https://status.law/faq"
]
68
+
69
# Load secrets; abort the app with a visible error if the Groq key is missing.
try:
    GROQ_API_KEY = st.secrets["GROQ_API_KEY"]
except Exception:
    st.error("Error loading secrets. Please check your configuration.")
    st.stop()
75
+
76
# Initialize models
@st.cache_resource
def init_models():
    """Create and cache the Groq chat model and the HuggingFace embedder.

    Returns:
        (llm, embeddings): the ChatGroq LLM and the multilingual E5 embeddings.
    """
    language_model = ChatGroq(
        model_name="llama-3.3-70b-versatile",
        temperature=0.6,
        api_key=GROQ_API_KEY,
    )
    embedding_model = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large-instruct",
    )
    return language_model, embedding_model
88
+
89
# Build knowledge base
def build_knowledge_base(embeddings):
    """Scrape the site, chunk the text, embed it into FAISS, and persist it.

    Side effects: writes the index to VECTOR_STORE_PATH and records build
    time / on-disk size in st.session_state.kb_info for the page caption.

    Returns:
        The populated FAISS vector store.
    """
    start_time = time.time()

    documents = []
    with st.status("Loading website content...") as status:
        for url in urls:
            try:
                documents.extend(WebBaseLoader(url).load())
                status.update(label=f"Loaded {url}")
            except Exception as e:
                # Best-effort: a single failed page should not abort the build.
                st.error(f"Error loading {url}: {str(e)}")

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_documents(documents)

    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(VECTOR_STORE_PATH)

    build_time = time.time() - start_time

    # Total size of the persisted index files, in megabytes.
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(VECTOR_STORE_PATH):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
    size_mb = total_size / (1024 * 1024)

    # Save knowledge base info
    st.session_state.kb_info['build_time'] = build_time
    st.session_state.kb_info['size'] = size_mb

    st.success(f"""
    Knowledge base created successfully:
    - Time taken: {build_time:.2f} seconds
    - Size: {size_mb:.2f} MB
    - Number of chunks: {len(chunks)}
    """)

    return vector_store
136
+
137
# Main function
def main():
    """Entry point: wire up the models, the knowledge base, and the chat UI."""
    llm, embeddings = init_models()

    # Knowledge-base bootstrap: offer to build it when missing; otherwise load
    # the persisted FAISS index once per session.
    if not os.path.exists(VECTOR_STORE_PATH):
        st.warning("Knowledge base not found.")
        if st.button("Create Knowledge Base"):
            st.session_state.vector_store = build_knowledge_base(embeddings)
            st.rerun()
    elif 'vector_store' not in st.session_state:
        st.session_state.vector_store = FAISS.load_local(
            VECTOR_STORE_PATH,
            embeddings,
            allow_dangerous_deserialization=True
        )

    # Chat mode — only once a vector store is available.
    if 'vector_store' in st.session_state:
        if 'messages' not in st.session_state:
            st.session_state.messages = []

        # Replay prior turns.
        for entry in st.session_state.messages:
            st.chat_message("user").write(entry["question"])
            st.chat_message("assistant").write(entry["answer"])

        # Handle a new question.
        question = st.chat_input("Ask your question")
        if question:
            st.chat_message("user").write(question)

            # Retrieve context and generate the answer.
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    retrieved = st.session_state.vector_store.similarity_search(question)
                    context_text = "\n".join(doc.page_content for doc in retrieved)

                    prompt = PromptTemplate.from_template("""
                    You are a helpful and polite legal assistant at Status Law.
                    You answer in the language in which the question was asked.
                    Answer the question based on the context provided.
                    If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
                    - For all users: +32465594521 (landline phone).
                    - For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
                    - Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
                    Answer professionally but in a friendly manner.

                    Example:
                    Q: How can I challenge the sanctions?
                    A: To challenge the sanctions, you should consult with our legal team, who specialize in this area. Please contact us directly for detailed advice. You can fill out our contact form here: [Contact Form](https://status.law/law-firm-contact-legal-protection/).

                    Context: {context}
                    Question: {question}
                    """)

                    chain = prompt | llm | StrOutputParser()
                    response = chain.invoke({
                        "context": context_text,
                        "question": question
                    })

                    st.write(response)

                    # Persist the turn for replay on the next rerun.
                    st.session_state.messages.append({
                        "question": question,
                        "answer": response
                    })

if __name__ == "__main__":
    main()
gitattributes ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
37
+
38
+ * text=auto eol=crlf
39
+
40
+ *.bin binary
41
+
42
+ .gitignore text eol=lf
43
+ .gitattributes text eol=lf
44
+
gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.env
2
+
3
+ venv
4
+ .streamlit/secrets.toml
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain-community
3
+ langchain-core
4
+ langchain-huggingface
5
+ langchain-groq
6
+ python-dotenv
7
+ beautifulsoup4
8
+ faiss-cpu
9
+ requests
two-in-one.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
# FIX: load_dotenv is called below but was never imported (NameError at startup).
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from requests.exceptions import RequestException, Timeout

# Load environment variables from a local .env file (fallback for st.secrets).
if os.path.exists(".env"):
    load_dotenv(verbose=True)

# Load API keys: prefer Streamlit secrets, fall back to the environment when
# no secrets file exists (st.secrets raises FileNotFoundError in that case —
# NOTE(review): newer Streamlit versions raise a different exception; verify).
try:
    GROQ_API_KEY = st.secrets["GROQ_API_KEY"]
    USER_AGENT = st.secrets["USER_AGENT"]
    LANGSMITH_TRACING = st.secrets["LANGSMITH_TRACING"]
    LANGSMITH_ENDPOINT = st.secrets["LANGSMITH_ENDPOINT"]
    LANGSMITH_API_KEY = st.secrets["LANGSMITH_API_KEY"]
    LANGSMITH_PROJECT = st.secrets["LANGSMITH_PROJECT"]
    OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
except FileNotFoundError:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    USER_AGENT = os.getenv("USER_AGENT")
    LANGSMITH_TRACING = os.getenv("LANGSMITH_TRACING")
    LANGSMITH_ENDPOINT = os.getenv("LANGSMITH_ENDPOINT")
    LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
    LANGSMITH_PROJECT = os.getenv("LANGSMITH_PROJECT")
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Abort early if any required key is missing.
if not all([GROQ_API_KEY, USER_AGENT, LANGSMITH_TRACING, LANGSMITH_ENDPOINT, LANGSMITH_API_KEY, LANGSMITH_PROJECT, OPENAI_API_KEY]):
    st.error("Ошибка: Не все переменные окружения заданы.")
    st.stop()
39
+
40
# Initialise the LLM; fail fast with a visible error if the key is invalid.
try:
    llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.6, api_key=GROQ_API_KEY)
    print("[DEBUG] LLM успешно инициализирован")
except Exception as e:
    st.error(f"Ошибка инициализации LLM: {e}")
    st.stop()

# Embedding model, used for both indexing and querying.
embeddings_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")
print("[DEBUG] Модель эмбеддингов загружена")
51
+
52
# Pages of status.law indexed into the knowledge base.
urls = [
    "https://status.law",
    "https://status.law/about",
    "https://status.law/careers",
    "https://status.law/challenging-sanctions",
    "https://status.law/contact",
    "https://status.law/cross-border-banking-legal-issues",
    "https://status.law/extradition-defense",
    "https://status.law/international-prosecution-protection",
    "https://status.law/interpol-red-notice-removal",
    "https://status.law/practice-areas",
    "https://status.law/reputation-protection",
    "https://status.law/faq",
]

# Directory where the FAISS vector store is persisted.
VECTOR_STORE_PATH = "vector_store"
70
+
71
# Build the knowledge base: scrape the site, split into chunks, embed, persist.
def build_knowledge_base():
    """Create a FAISS vector store from the site pages and save it to disk.

    Returns:
        The populated FAISS vector store (also saved under VECTOR_STORE_PATH).
    """
    documents = []
    for url in urls:
        try:
            # FIX: WebBaseLoader.load() takes no `timeout` argument (the
            # original call raised TypeError); the per-request timeout is
            # passed via `requests_kwargs` instead.
            loader = WebBaseLoader(url, requests_kwargs={"timeout": 10})
            documents.extend(loader.load())
            st.write(f"[DEBUG] Загружен контент с {url}")
        except (RequestException, Timeout) as e:
            # Best-effort: a single failed page should not abort the build.
            st.write(f"[ERROR] Ошибка загрузки страницы {url}: {e}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)
    st.write(f"[DEBUG] Разбито на {len(chunks)} фрагментов")
    vector_store = FAISS.from_documents(chunks, embeddings_model)
    vector_store.save_local(VECTOR_STORE_PATH)
    st.write("[DEBUG] Векторное хранилище создано и сохранено")
    return vector_store
88
+
89
# Load the knowledge base from disk, or build it on first run.
@st.cache_resource
def load_knowledge_base():
    """Return the FAISS store, loading from VECTOR_STORE_PATH when present."""
    if os.path.exists(VECTOR_STORE_PATH):
        st.write("[DEBUG] Загрузка существующего векторного хранилища")
        # FIX: current langchain requires explicitly opting in to pickle
        # deserialization of a local index; without this flag load_local
        # raises a ValueError (app.py in this same commit passes it too).
        return FAISS.load_local(
            VECTOR_STORE_PATH,
            embeddings_model,
            allow_dangerous_deserialization=True,
        )
    st.write("[DEBUG] Векторное хранилище не найдено, создание нового")
    return build_knowledge_base()

# Load or create the knowledge base at import time.
vector_store = load_knowledge_base()
101
+
102
# Prompt for the bot: context-grounded question answering.
template = """
You are a helpful legal assistant that answers questions based on information from status.law.
Answer accurately and concisely.
Question: {question}
Only use the provided context to answer the question.
Context: {context}
"""
prompt = PromptTemplate.from_template(template)

# Build the request-processing chain once per session.
if "chain" not in st.session_state:
    st.session_state.chain = (
        RunnableLambda(lambda x: {"context": x["context"], "question": x["question"]})
        | prompt
        | llm
        | StrOutputParser()
    )
chain = st.session_state.chain

# Streamlit UI.
# NOTE(review): st.set_page_config must be the first Streamlit command in the
# script, but the debug st.write calls above already run before it — confirm
# and move this call to the top of the file.
st.set_page_config(page_title="Legal Chatbot", page_icon="🤖")
st.title("🤖 Legal Chatbot")
st.write("Этот бот отвечает на юридические вопросы, используя информацию с сайта status.law.")

# Question input.
user_input = st.text_input("Введите ваш вопрос:")
if st.button("Отправить") and user_input:
    # Retrieve relevant documents for the question.
    retrieved_docs = vector_store.similarity_search(user_input)
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # Generate the answer.
    response = chain.invoke({"question": user_input, "context": context_text})

    # Persist the turn in the message history.
    if "message_history" not in st.session_state:
        st.session_state.message_history = []
    st.session_state.message_history.append({"question": user_input, "answer": response})

    # Show the answer.
    st.write(response)

# Replay the message history.
if "message_history" in st.session_state:
    st.write("### История сообщений")
    for msg in st.session_state.message_history:
        st.write(f"**User:** {msg['question']}")
        st.write(f"**Bot:** {msg['answer']}")