Yingfeng
committed on
Commit
·
f23a141
1
Parent(s):
f05a941
Synchronize with enterprise version (#4325)
Browse files
### Type of change
- [x] Refactoring
- agent/templates/customer_service.json +3 -3
- rag/app/knowledge_graph.py +2 -2
- rag/app/manual.py +2 -2
- rag/llm/chat_model.py +1 -1
- rag/llm/tts_model.py +27 -2
- rag/svr/cache_file_svr.py +59 -59
agent/templates/customer_service.json
CHANGED
|
@@ -336,7 +336,7 @@
|
|
| 336 |
"parameters": [],
|
| 337 |
"presencePenaltyEnabled": true,
|
| 338 |
"presence_penalty": 0.4,
|
| 339 |
-
"prompt": "Role: You are a customer support. \n\nTask: Please answer the question based on content of knowledge base. \n\
|
| 340 |
"temperature": 0.1,
|
| 341 |
"temperatureEnabled": true,
|
| 342 |
"topPEnabled": true,
|
|
@@ -603,7 +603,7 @@
|
|
| 603 |
{
|
| 604 |
"data": {
|
| 605 |
"form": {
|
| 606 |
-
"text": "Static messages.\nDefine
|
| 607 |
},
|
| 608 |
"label": "Note",
|
| 609 |
"name": "N: What else?"
|
|
@@ -691,7 +691,7 @@
|
|
| 691 |
{
|
| 692 |
"data": {
|
| 693 |
"form": {
|
| 694 |
-
"text": "Complete questions by conversation history.\nUser: What's RAGFlow?\nAssistant: RAGFlow is xxx.\nUser: How to
|
| 695 |
},
|
| 696 |
"label": "Note",
|
| 697 |
"name": "N: Refine Question"
|
|
|
|
| 336 |
"parameters": [],
|
| 337 |
"presencePenaltyEnabled": true,
|
| 338 |
"presence_penalty": 0.4,
|
| 339 |
+
"prompt": "Role: You are a customer support. \n\nTask: Please answer the question based on content of knowledge base. \n\nRequirements & restrictions:\n - DO NOT make things up when all knowledge base content is irrelevant to the question. \n - Answers need to consider chat history.\n - Request about customer's contact information like, Wechat number, LINE number, twitter, discord, etc,. , when knowledge base content can't answer his question. So, product expert could contact him soon to solve his problem.\n\n Knowledge base content is as following:\n {input}\n The above is the content of knowledge base.",
|
| 340 |
"temperature": 0.1,
|
| 341 |
"temperatureEnabled": true,
|
| 342 |
"topPEnabled": true,
|
|
|
|
| 603 |
{
|
| 604 |
"data": {
|
| 605 |
"form": {
|
| 606 |
+
"text": "Static messages.\nDefine response after receive user's contact information."
|
| 607 |
},
|
| 608 |
"label": "Note",
|
| 609 |
"name": "N: What else?"
|
|
|
|
| 691 |
{
|
| 692 |
"data": {
|
| 693 |
"form": {
|
| 694 |
+
"text": "Complete questions by conversation history.\nUser: What's RAGFlow?\nAssistant: RAGFlow is xxx.\nUser: How to deploy it?\n\nRefine it: How to deploy RAGFlow?"
|
| 695 |
},
|
| 696 |
"label": "Note",
|
| 697 |
"name": "N: Refine Question"
|
rag/app/knowledge_graph.py
CHANGED
|
@@ -9,7 +9,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
|
|
| 9 |
lang="Chinese", callback=None, **kwargs):
|
| 10 |
parser_config = kwargs.get(
|
| 11 |
"parser_config", {
|
| 12 |
-
"chunk_token_num": 512, "delimiter": "\n
|
| 13 |
eng = lang.lower() == "english"
|
| 14 |
|
| 15 |
parser_config["layout_recognize"] = True
|
|
@@ -29,4 +29,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
|
|
| 29 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
| 30 |
chunks.extend(tokenize_chunks(sections, doc, eng))
|
| 31 |
|
| 32 |
-
return chunks
|
|
|
|
| 9 |
lang="Chinese", callback=None, **kwargs):
|
| 10 |
parser_config = kwargs.get(
|
| 11 |
"parser_config", {
|
| 12 |
+
"chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
|
| 13 |
eng = lang.lower() == "english"
|
| 14 |
|
| 15 |
parser_config["layout_recognize"] = True
|
|
|
|
| 29 |
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
| 30 |
chunks.extend(tokenize_chunks(sections, doc, eng))
|
| 31 |
|
| 32 |
+
return chunks
|
rag/app/manual.py
CHANGED
|
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 256 |
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
| 257 |
return res
|
| 258 |
|
| 259 |
-
elif re.search(r"\.docx
|
| 260 |
docx_parser = Docx()
|
| 261 |
ti_list, tbls = docx_parser(filename, binary,
|
| 262 |
from_page=0, to_page=10000, callback=callback)
|
|
@@ -279,4 +279,4 @@ if __name__ == "__main__":
|
|
| 279 |
pass
|
| 280 |
|
| 281 |
|
| 282 |
-
chunk(sys.argv[1], callback=dummy)
|
|
|
|
| 256 |
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
| 257 |
return res
|
| 258 |
|
| 259 |
+
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
|
| 260 |
docx_parser = Docx()
|
| 261 |
ti_list, tbls = docx_parser(filename, binary,
|
| 262 |
from_page=0, to_page=10000, callback=callback)
|
|
|
|
| 279 |
pass
|
| 280 |
|
| 281 |
|
| 282 |
+
chunk(sys.argv[1], callback=dummy)
|
rag/llm/chat_model.py
CHANGED
|
@@ -24,7 +24,6 @@ import openai
|
|
| 24 |
from ollama import Client
|
| 25 |
from rag.nlp import is_chinese, is_english
|
| 26 |
from rag.utils import num_tokens_from_string
|
| 27 |
-
from groq import Groq
|
| 28 |
import os
|
| 29 |
import json
|
| 30 |
import requests
|
|
@@ -840,6 +839,7 @@ class GeminiChat(Base):
|
|
| 840 |
|
| 841 |
class GroqChat:
|
| 842 |
def __init__(self, key, model_name, base_url=''):
|
|
|
|
| 843 |
self.client = Groq(api_key=key)
|
| 844 |
self.model_name = model_name
|
| 845 |
|
|
|
|
| 24 |
from ollama import Client
|
| 25 |
from rag.nlp import is_chinese, is_english
|
| 26 |
from rag.utils import num_tokens_from_string
|
|
|
|
| 27 |
import os
|
| 28 |
import json
|
| 29 |
import requests
|
|
|
|
| 839 |
|
| 840 |
class GroqChat:
|
| 841 |
def __init__(self, key, model_name, base_url=''):
|
| 842 |
+
from groq import Groq
|
| 843 |
self.client = Groq(api_key=key)
|
| 844 |
self.model_name = model_name
|
| 845 |
|
rag/llm/tts_model.py
CHANGED
|
@@ -299,8 +299,6 @@ class SparkTTS:
|
|
| 299 |
yield audio_chunk
|
| 300 |
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
class XinferenceTTS:
|
| 305 |
def __init__(self, key, model_name, **kwargs):
|
| 306 |
self.base_url = kwargs.get("base_url", None)
|
|
@@ -330,3 +328,30 @@ class XinferenceTTS:
|
|
| 330 |
for chunk in response.iter_content(chunk_size=1024):
|
| 331 |
if chunk:
|
| 332 |
yield chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
yield audio_chunk
|
| 300 |
|
| 301 |
|
|
|
|
|
|
|
| 302 |
class XinferenceTTS:
|
| 303 |
def __init__(self, key, model_name, **kwargs):
|
| 304 |
self.base_url = kwargs.get("base_url", None)
|
|
|
|
| 328 |
for chunk in response.iter_content(chunk_size=1024):
|
| 329 |
if chunk:
|
| 330 |
yield chunk
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
class OllamaTTS(Base):
    """Text-to-speech client that POSTs to ``{base_url}/audio/tts`` and streams the reply."""

    def __init__(self, key, model_name="ollama-tts", base_url="https://api.ollama.ai/v1"):
        """Remember the model name, endpoint root and request headers.

        Args:
            key: Not used by this class; presumably kept so the constructor
                lines up with the other TTS classes -- confirm with callers.
            model_name: Value sent as ``"model"`` in every request payload.
            base_url: Endpoint root. A falsy value (``None``, ``""``) falls
                back to ``https://api.ollama.ai/v1``.
        """
        if not base_url:
            base_url = "https://api.ollama.ai/v1"
        self.model_name = model_name
        self.base_url = base_url
        self.headers = {
            "Content-Type": "application/json"
        }

    def tts(self, text, voice="standard-voice"):
        """Request speech for ``text`` and yield the response body piece by piece.

        This is a generator, so the HTTP request is only sent once iteration
        starts.

        Args:
            text: Sent as ``"input"`` in the JSON payload.
            voice: Sent as ``"voice"`` in the JSON payload.

        Yields:
            Non-empty chunks of the response body.

        Raises:
            Exception: If the response status code is not 200; the message
                carries the status code and the response text.
        """
        payload = {
            "model": self.model_name,
            "voice": voice,
            "input": text
        }

        response = requests.post(f"{self.base_url}/audio/tts", headers=self.headers, json=payload, stream=True)

        if response.status_code != 200:
            raise Exception(f"**Error**: {response.status_code}, {response.text}")

        # iter_content() defaults to chunk_size=1, i.e. one byte per yielded
        # chunk. Use the same 1024-byte chunks as XinferenceTTS so the stream
        # is not consumed byte by byte.
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                yield chunk
|
rag/svr/cache_file_svr.py
CHANGED
|
@@ -1,60 +1,60 @@
|
|
| 1 |
-
#
|
| 2 |
-
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
| 3 |
-
#
|
| 4 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
-
# you may not use this file except in compliance with the License.
|
| 6 |
-
# You may obtain a copy of the License at
|
| 7 |
-
#
|
| 8 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
-
#
|
| 10 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
-
# See the License for the specific language governing permissions and
|
| 14 |
-
# limitations under the License.
|
| 15 |
-
#
|
| 16 |
-
import logging
|
| 17 |
-
import time
|
| 18 |
-
import traceback
|
| 19 |
-
|
| 20 |
-
from api.db.db_models import close_connection
|
| 21 |
-
from api.db.services.task_service import TaskService
|
| 22 |
-
from rag.utils.
|
| 23 |
-
from rag.utils.redis_conn import REDIS_CONN
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def collect():
|
| 27 |
-
doc_locations = TaskService.get_ongoing_doc_name()
|
| 28 |
-
logging.debug(doc_locations)
|
| 29 |
-
if len(doc_locations) == 0:
|
| 30 |
-
time.sleep(1)
|
| 31 |
-
return
|
| 32 |
-
return doc_locations
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
if __name__ == "__main__":
|
| 57 |
-
while True:
|
| 58 |
-
main()
|
| 59 |
-
close_connection()
|
| 60 |
time.sleep(1)
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
#
|
| 16 |
+
import logging
|
| 17 |
+
import time
|
| 18 |
+
import traceback
|
| 19 |
+
|
| 20 |
+
from api.db.db_models import close_connection
|
| 21 |
+
from api.db.services.task_service import TaskService
|
| 22 |
+
from rag.utils.minio_conn import MINIOs
|
| 23 |
+
from rag.utils.redis_conn import REDIS_CONN
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def collect():
    """Return the ongoing document locations, or ``None`` when there are none.

    Returns:
        The value of ``TaskService.get_ongoing_doc_name()`` when it is
        non-empty. When it is empty, the function pauses for one second and
        returns ``None``.
    """
    ongoing = TaskService.get_ongoing_doc_name()
    logging.debug(ongoing)
    if len(ongoing) > 0:
        return ongoing
    # Nothing pending: back off for a second before the caller polls again.
    time.sleep(1)
    return None
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def main():
    """Copy the files of ongoing documents into Redis, one pass per call.

    Each ``(kb_id, loc)`` pair returned by ``collect()`` is handled as follows:

    * skipped while ``REDIS_CONN.is_alive()`` is false;
    * skipped when the key ``"<kb_id>/<loc>"`` already exists in Redis;
    * otherwise fetched with ``MINIOs.get(kb_id, loc)`` and stored through
      ``REDIS_CONN.transaction(key, file_bin, 12 * 60)``.

    A failure on one entry is logged with its traceback and the loop moves on
    to the next entry, so one bad file does not stop the pass.
    """
    locations = collect()
    if not locations:
        return
    logging.info(f"TASKS: {len(locations)}")
    for kb_id, loc in locations:
        try:
            if not REDIS_CONN.is_alive():
                continue
            key = "{}/{}".format(kb_id, loc)
            if REDIS_CONN.exist(key):
                continue
            file_bin = MINIOs.get(kb_id, loc)
            # NOTE(review): the third argument (12 * 60) looks like an expiry;
            # confirm the unit against REDIS_CONN.transaction.
            REDIS_CONN.transaction(key, file_bin, 12 * 60)
            logging.info("CACHE: {}".format(loc))
        except Exception:
            # traceback.print_stack() expects a frame object, so handing it the
            # exception raised AttributeError from inside the handler and
            # aborted the whole pass. logging.exception records the active
            # traceback instead.
            logging.exception("Failed to cache {}/{}".format(kb_id, loc))


if __name__ == "__main__":
    # Poll forever: run one caching pass, close the DB connection, then wait a
    # second before the next pass.
    while True:
        main()
        close_connection()
        time.sleep(1)
|