Spaces:
Sleeping
Sleeping
DialogueRobust
committed on
Commit
·
e490ab5
1
Parent(s):
292920a
First commit
Browse files- .gitattributes +1 -0
- README.md +6 -6
- backend.py +82 -0
- build_index.py +27 -0
- config.ini +16 -0
- data/demo_prompt.json +3 -0
- data/docs.pkl +3 -0
- data/documents.json +3 -0
- data/index.faiss +3 -0
- embeddings.py +12 -0
- requirements.txt +5 -0
- semantic.py +20 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/* filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license:
|
| 11 |
-
short_description:
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Robust Dialogue Demo
|
| 3 |
+
emoji: ๐
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: cc
|
| 11 |
+
short_description: 'Domain-Specific Retrieval-Augmented Generation demo '
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
backend.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from semantic import SemanticSearch
|
| 2 |
+
import json
|
| 3 |
+
import argparse
|
| 4 |
+
import os
|
| 5 |
+
from embeddings import get_embedding
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
import configparser
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BackEnd:
    """Retrieval-augmented generation backend.

    Embeds the user query, retrieves supporting documents via semantic
    search, assembles a prompt from a language-specific template, and asks
    an OpenAI model for a JSON-formatted answer.
    """

    def __init__(self, config):
        """Initialise the backend.

        Args:
            config: parsed ``configparser`` object. NOTE(review): most
                settings are currently hard-coded for the demo and the
                config values are ignored — the TODOs mark each spot.

        Raises:
            SystemExit: if the prompt file cannot be read or parsed.
        """
        self.model = "gpt-4.1"  # TODO: read from config['ChatBot']['model']

        self.client = OpenAI()

        self.semantic_search = SemanticSearch()  # TODO: pass config['Semantic Search']

        prompt_file = 'data/demo_prompt.json'  # TODO: config['ChatBot']['prompt file']
        try:
            with open(prompt_file) as json_file:
                prompts = json.load(json_file)
        except (OSError, json.JSONDecodeError) as err:
            # Catch only load/parse failures (the old bare `except:` hid
            # everything) and report the path actually opened — the previous
            # message printed the config value, which is not what was read.
            raise SystemExit(
                f"ERROR. Couldn't load prompt file {prompt_file} or wrong json format"
            ) from err

        lang = 'fr'  # TODO: config['General']['language'][:2].lower()
        if lang == 'fr':
            self.prompt_template = prompts["French"]
        else:
            # Default to English so self.prompt_template is always defined
            # (the original left it unset for unknown languages, causing a
            # later AttributeError in process_query).
            self.prompt_template = prompts["English"]

    def process_query(self, query):
        """Answer *query* using retrieved context documents.

        Args:
            query: the user's question as a plain string.

        Returns:
            tuple: (parsed JSON answer from the model, list of context
            strings, each prefixed with its "Document N" header).
        """
        query_embeddings = get_embedding(query)
        context = self.semantic_search.search(query_embeddings)
        # Label each retrieved passage so the model can cite "Document N".
        for index, document in enumerate(context):
            context[index] = 'Document ' + str(index + 1) + '\n\n' + document
        print('context = ', context)

        documents = '\n\n'.join(context)

        # Prompt = system prompt + prefix (query/context) + docs + postfix.
        prompt = self.prompt_template['system_prompt']
        demo_prefix = self.prompt_template['demo_prefix'].format(query=query, context=context)
        prompt += demo_prefix + '\n' + documents + '\n\n'
        demo_postfix = self.prompt_template['demo_postfix']
        prompt += demo_postfix

        if 'gpt' in self.model:
            response = self.client.responses.create(
                model=self.model,
                input=prompt)

        # Model is instructed (via the template) to reply in JSON.
        return json.loads(response.output_text), context
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# def main():
|
| 66 |
+
# parser = argparse.ArgumentParser()
|
| 67 |
+
# parser.add_argument('--config_file', type=str, required=True, help='File containing the configuration for the backend (in .ini format)')
|
| 68 |
+
# parser.add_argument('--query', type=str, required=False, help='Test query for testing the system')
|
| 69 |
+
|
| 70 |
+
# args = parser.parse_args()
|
| 71 |
+
|
| 72 |
+
# config = configparser.ConfigParser()
|
| 73 |
+
# config.read(args.config_file)
|
| 74 |
+
# backend = BackEnd(config)
|
| 75 |
+
|
| 76 |
+
# if args.query:
|
| 77 |
+
# response = backend.process_query(args.query)
|
| 78 |
+
# print(response)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# if __name__ == '__main__':
|
| 82 |
+
# main()
|
build_index.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# build_index.py
#
# Offline step: embed every document section, build a flat FAISS L2 index,
# and persist both the index and the raw documents for the Space runtime.
import os
import json
import pickle
import faiss
import numpy as np
from embeddings import get_embedding

DOCS_FILE = "data/documents.json"

# Load the corpus: a list of dicts, each with (at least) a "section" key.
with open(DOCS_FILE, "r", encoding="utf-8") as f:
    docs = json.load(f)

# Compute embeddings (one embedding-API call per section).
embs = [get_embedding(d["section"]) for d in docs]

# Create FAISS index (exact L2 search; fine at this corpus size).
dim = len(embs[0])
index = faiss.IndexFlatL2(dim)
index.add(np.array(embs).astype("float32"))

# Save outputs for the Space runtime
faiss.write_index(index, "data/index.faiss")
with open("data/docs.pkl", "wb") as f:
    pickle.dump(docs, f)

print("✅ Index and docs saved to data/")
|
config.ini
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[General]
|
| 2 |
+
language: French
|
| 3 |
+
|
| 4 |
+
[ChatBot]
|
| 5 |
+
model: gpt-4.1
|
| 6 |
+
temperature: 0.1
|
| 7 |
+
prompt file: demo_prompt.json
|
| 8 |
+
|
| 9 |
+
[Semantic Search]
|
| 10 |
+
embedding models: dangvantuan/sentence-camembert-large Lajavaness/sentence-flaubert-base
|
| 11 |
+
database file: Livre_blanc_IA_CRIM.shorter.sections.json
|
| 12 |
+
metric: cosine distance
|
| 13 |
+
text key: section
|
| 14 |
+
topk: 5
|
| 15 |
+
|
| 16 |
+
|
data/demo_prompt.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d11c98d850aa30740ac3a39757e0e6216da3d66adda7e8e7f588f2e6dbbaa04a
|
| 3 |
+
size 3593
|
data/docs.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb5f2382efad046adb66db5012e7f9e46e53a047f08ac9a2cbbbfb7b0ce99691
|
| 3 |
+
size 224530
|
data/documents.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81b954096e0387165987cbc0acac8570eda33513a95108da04d481fd228b4af2
|
| 3 |
+
size 229378
|
data/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78b0060c9e9c4d574a03d588c2e843a271f82206950ff2c6c00e3610f30ad891
|
| 3 |
+
size 823341
|
embeddings.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# embeddings.py
#
# Thin wrapper around the OpenAI embeddings endpoint.
import os
from openai import OpenAI

# Module-level client shared by all callers; requires OPENAI_API_KEY in the
# environment (raises KeyError at import time if it is missing).
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def get_embedding(text: str, model="text-embedding-3-small"):
    """Return the embedding vector for *text*.

    Args:
        text: string to embed (a single input, not a batch).
        model: OpenAI embedding model name.

    Returns:
        The embedding as a list of floats — the first (and, for a single
        input, only) item of the API response.
    """
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
openai>=1.0.0
|
| 3 |
+
faiss-cpu
|
| 4 |
+
numpy
|
| 5 |
+
pickle5
|
semantic.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
INDEX_PATH = "data/index.faiss"
DOCS_PATH = "data/docs.pkl"

class SemanticSearch:
    """Nearest-neighbour lookup over a prebuilt FAISS index of documents."""

    def __init__(self):
        # Load the FAISS index and the pickled document list built offline
        # by build_index.py.
        self.index = faiss.read_index(INDEX_PATH)
        with open(DOCS_PATH, "rb") as fh:
            self.docs = pickle.load(fh)

    def search(self, query_emb, k=3):
        """Return the 'section' text of the top-k documents for *query_emb*."""
        query_matrix = np.array([query_emb]).astype("float32")
        _distances, neighbour_ids = self.index.search(query_matrix, k)
        # FAISS pads the id row with -1 when fewer than k neighbours exist;
        # skip those padding entries.
        return [self.docs[doc_id]['section']
                for doc_id in neighbour_ids[0]
                if doc_id >= 0]
|