DialogueRobust committed on
Commit
e490ab5
·
1 Parent(s): 292920a

First commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: RobustDialogueDemo
3
- emoji: 📚
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
- short_description: Robust reasoning demo
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Robust Dialogue Demo
3
+ emoji: ๐Ÿƒ
4
+ colorFrom: purple
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
+ license: cc
11
+ short_description: 'Domain-Specific Retrieval-Augmented Generation demo '
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
backend.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from semantic import SemanticSearch
2
+ import json
3
+ import argparse
4
+ import os
5
+ from embeddings import get_embedding
6
+ from openai import OpenAI
7
+ import configparser
8
+
9
+
10
+
11
+ class BackEnd:
12
+
13
+ def __init__(self, config):
14
+ self.model = "gpt-4.1" #config['ChatBot']['model']
15
+
16
+ self.client = OpenAI()
17
+
18
+
19
+ self.semantic_search = SemanticSearch()#config['Semantic Search'])
20
+ try:
21
+ with open('data/demo_prompt.json') as json_file:#config['ChatBot']['prompt file']) as json_file:
22
+ prompts = json.load(json_file)
23
+ except:
24
+ print(f"ERROR. Couldn't load prompt file {config['ChatBot']['prompt file']} or wrong json format")
25
+ quit()
26
+
27
+ lang = 'fr'#config['General']['language'][:2].lower()
28
+ if lang == 'fr':
29
+ self.prompt_template = prompts["French"]
30
+ elif lang == 'en':
31
+ self.prompt_template = prompts["English"]
32
+
33
+
34
+
35
+
36
+
37
+ def process_query(self, query):
38
+
39
+
40
+ query_embeddings = get_embedding(query)
41
+ context = self.semantic_search.search(query_embeddings)
42
+ for index, document in enumerate(context):
43
+ context[index] = 'Document ' + str(index + 1) + '\n\n' + document
44
+ print('context = ', context)
45
+
46
+
47
+ documents = '\n\n'.join(context)
48
+
49
+ prompt = self.prompt_template['system_prompt']
50
+ demo_prefix = self.prompt_template['demo_prefix'].format(query = query, context = context)
51
+ prompt += demo_prefix + '\n' + documents + '\n\n'
52
+ demo_postfix = self.prompt_template['demo_postfix']
53
+ prompt += demo_postfix
54
+
55
+ if 'gpt' in self.model:
56
+ response = self.client.responses.create(
57
+ model = self.model ,
58
+ input= prompt)
59
+
60
+ return json.loads(response.output_text), context
61
+
62
+
63
+
64
+
65
+ # def main():
66
+ # parser = argparse.ArgumentParser()
67
+ # parser.add_argument('--config_file', type=str, required=True, help='File containing the configuration for the backend (in .ini format)')
68
+ # parser.add_argument('--query', type=str, required=False, help='Test query for testing the system')
69
+
70
+ # args = parser.parse_args()
71
+
72
+ # config = configparser.ConfigParser()
73
+ # config.read(args.config_file)
74
+ # backend = BackEnd(config)
75
+
76
+ # if args.query:
77
+ # response = backend.process_query(args.query)
78
+ # print(response)
79
+
80
+
81
+ # if __name__ == '__main__':
82
+ # main()
build_index.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_index.py
2
+ import os
3
+ import json
4
+ import pickle
5
+ import faiss
6
+ import numpy as np
7
+ from embeddings import get_embedding
8
+
9
+ DOCS_FILE = "data/documents.json"
10
+
11
+ with open(DOCS_FILE, "r") as f:
12
+ docs = json.load(f)
13
+
14
+ # Compute embeddings
15
+ embs = [get_embedding(d["section"]) for d in docs]
16
+
17
+ # Create FAISS index
18
+ dim = len(embs[0])
19
+ index = faiss.IndexFlatL2(dim)
20
+ index.add(np.array(embs).astype("float32"))
21
+
22
+ # Save outputs for the Space runtime
23
+ faiss.write_index(index, "data/index.faiss")
24
+ with open("data/docs.pkl", "wb") as f:
25
+ pickle.dump(docs, f)
26
+
27
+ print("โœ… Index and docs saved to data/")
config.ini ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [General]
2
+ language: French
3
+
4
+ [ChatBot]
5
+ model: gpt-4.1
6
+ temperature: 0.1
7
+ prompt file: demo_prompt.json
8
+
9
+ [Semantic Search]
10
+ embedding models: dangvantuan/sentence-camembert-large Lajavaness/sentence-flaubert-base
11
+ database file: Livre_blanc_IA_CRIM.shorter.sections.json
12
+ metric: cosine distance
13
+ text key: section
14
+ topk: 5
15
+
16
+
data/demo_prompt.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d11c98d850aa30740ac3a39757e0e6216da3d66adda7e8e7f588f2e6dbbaa04a
3
+ size 3593
data/docs.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb5f2382efad046adb66db5012e7f9e46e53a047f08ac9a2cbbbfb7b0ce99691
3
+ size 224530
data/documents.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81b954096e0387165987cbc0acac8570eda33513a95108da04d481fd228b4af2
3
+ size 229378
data/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78b0060c9e9c4d574a03d588c2e843a271f82206950ff2c6c00e3610f30ad891
3
+ size 823341
embeddings.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embeddings.py
2
+ import os
3
+ from openai import OpenAI
4
+
5
+ client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
6
+
7
+ def get_embedding(text: str, model="text-embedding-3-small"):
8
+ response = client.embeddings.create(
9
+ model=model,
10
+ input=text
11
+ )
12
+ return response.data[0].embedding
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ openai>=1.0.0
3
+ faiss-cpu
4
+ numpy
5
+ pickle5
semantic.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import pickle
4
+
5
+ INDEX_PATH = "data/index.faiss"
6
+ DOCS_PATH = "data/docs.pkl"
7
+
8
+ class SemanticSearch:
9
+ def __init__(self):
10
+ self.index = faiss.read_index(INDEX_PATH)
11
+ with open(DOCS_PATH, "rb") as f:
12
+ self.docs = pickle.load(f)
13
+
14
+ def search(self, query_emb, k=3):
15
+ D, I = self.index.search(np.array([query_emb]).astype("float32"), k)
16
+ results = []
17
+ for idx in I[0]:
18
+ if idx >= 0:
19
+ results.append(self.docs[idx]['section'])
20
+ return results