YvesP commited on
Commit
65642c3
·
1 Parent(s): 32b5894

initial load

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 81/generated_text.txt +7 -0
  2. README.md +4 -4
  3. __pycache__/test_input_file.cpython-311-pytest-7.3.1.pyc +0 -0
  4. app.py +374 -0
  5. data/list +20 -0
  6. data/long_example.txt +21 -0
  7. data/onetask_example.txt +7 -0
  8. data/usage.txt +13 -0
  9. requirements.txt +111 -0
  10. src/control/__pycache__/control.cpython-311.pyc +0 -0
  11. src/control/control.py +93 -0
  12. src/model/__pycache__/document.cpython-311.pyc +0 -0
  13. src/model/document.py +176 -0
  14. src/model/model.py +60 -0
  15. src/tools/__pycache__/llm_tools.cpython-311.pyc +0 -0
  16. src/tools/__pycache__/llms.cpython-311.pyc +0 -0
  17. src/tools/__pycache__/semantic_db.cpython-311.pyc +0 -0
  18. src/tools/__pycache__/wiki.cpython-311.pyc +0 -0
  19. src/tools/llm_tools.py +207 -0
  20. src/tools/llms.py +20 -0
  21. src/tools/semantic_db.py +60 -0
  22. src/tools/wiki.py +61 -0
  23. tests/.chroma/index/id_to_uuid_0c55a091-9f95-4a8d-b868-83d95412fdc4.pkl +3 -0
  24. tests/.chroma/index/id_to_uuid_31dac11a-6e77-49ca-a1b5-fce9e3fe275a.pkl +3 -0
  25. tests/.chroma/index/id_to_uuid_37825327-eef6-4255-92ac-787c21197d77.pkl +3 -0
  26. tests/.chroma/index/id_to_uuid_40ba1a00-ce47-4e51-a2d3-56eb96ecb82b.pkl +3 -0
  27. tests/.chroma/index/id_to_uuid_46204504-325f-47e6-9176-e2054080ad57.pkl +3 -0
  28. tests/.chroma/index/id_to_uuid_64afc7c0-c153-47d1-af52-55e1738ae76c.pkl +3 -0
  29. tests/.chroma/index/id_to_uuid_69550299-be81-45fa-8bbf-3d83be2d7991.pkl +3 -0
  30. tests/.chroma/index/id_to_uuid_78f80853-f999-4f5e-b320-41c98bd28592.pkl +3 -0
  31. tests/.chroma/index/id_to_uuid_90d6076c-bb50-40ed-90a1-2df2243fd12e.pkl +3 -0
  32. tests/.chroma/index/id_to_uuid_a10bf13e-424a-41cd-bcfb-27d8072711ea.pkl +3 -0
  33. tests/.chroma/index/id_to_uuid_a6f9bfcf-0593-40b1-a282-a54d5b75d939.pkl +3 -0
  34. tests/.chroma/index/id_to_uuid_aba244c9-042f-42a3-860c-a68e1ee0b4a5.pkl +3 -0
  35. tests/.chroma/index/id_to_uuid_afc3d29f-a033-4bcf-9ef4-e93b6211ac95.pkl +3 -0
  36. tests/.chroma/index/id_to_uuid_b5e184d4-5839-4b0b-9bd8-638fa6bc080a.pkl +3 -0
  37. tests/.chroma/index/id_to_uuid_bcb0093e-68dd-4d75-a758-63ef7a681d92.pkl +3 -0
  38. tests/.chroma/index/id_to_uuid_bf57b36f-a918-4484-b897-79f751d5cad4.pkl +3 -0
  39. tests/.chroma/index/id_to_uuid_e208b245-d2cd-4069-9a8c-d5f010d91afb.pkl +3 -0
  40. tests/.chroma/index/id_to_uuid_f09229bd-8639-49e8-8a84-8e6e0aa11971.pkl +3 -0
  41. tests/.chroma/index/index_0c55a091-9f95-4a8d-b868-83d95412fdc4.bin +3 -0
  42. tests/.chroma/index/index_31dac11a-6e77-49ca-a1b5-fce9e3fe275a.bin +3 -0
  43. tests/.chroma/index/index_37825327-eef6-4255-92ac-787c21197d77.bin +3 -0
  44. tests/.chroma/index/index_40ba1a00-ce47-4e51-a2d3-56eb96ecb82b.bin +3 -0
  45. tests/.chroma/index/index_46204504-325f-47e6-9176-e2054080ad57.bin +3 -0
  46. tests/.chroma/index/index_64afc7c0-c153-47d1-af52-55e1738ae76c.bin +3 -0
  47. tests/.chroma/index/index_69550299-be81-45fa-8bbf-3d83be2d7991.bin +3 -0
  48. tests/.chroma/index/index_78f80853-f999-4f5e-b320-41c98bd28592.bin +3 -0
  49. tests/.chroma/index/index_90d6076c-bb50-40ed-90a1-2df2243fd12e.bin +3 -0
  50. tests/.chroma/index/index_a10bf13e-424a-41cd-bcfb-27d8072711ea.bin +3 -0
81/generated_text.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ !! terrorism in Italy in the years 70 and 80
2
+
3
+ # what happened
4
+
5
+ ## the killing of Aldo Moro
6
+ Aldo Moro was an Italian statesman and a prominent member of the Christian Democracy party. He was kidnapped and killed by the Red Brigades, a left-wing terrorist group, in 1978. The Red Brigades wanted to destabilize the Italian government and force the release of their imprisoned members. They believed that Moro was the key to achieving their goals, and so they targeted him for assassination. Moro's death was a major blow to the Italian government and to the Christian Democracy party, and it marked the beginning of a period of increased terrorism in Italy. The Red Brigades were eventually disbanded, but the legacy of Moro's death still lingers in Italy today.
7
+
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: GPTdoc
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 3.32.0
8
  app_file: app.py
 
1
  ---
2
+ title: Gendoc
3
+ emoji: 🦀
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.32.0
8
  app_file: app.py
__pycache__/test_input_file.cpython-311-pytest-7.3.1.pyc ADDED
Binary file (7.48 kB). View file
 
app.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ import gradio as gr
3
+ import numpy as np
4
+ import asyncio
5
+ import shutil
6
+
7
+ import src.control.control as control
8
+
9
+
10
+ """
11
+ ==================================
12
+ A. Component part
13
+ ==================================
14
+ """
15
+
16
+ with gr.Blocks() as docgpt:
17
+ with gr.Row():
18
+
19
+ with gr.Column():
20
+ pass
21
+
22
+ with gr.Column(scale=10):
23
+ """
24
+ 1. input docs components
25
+ """
26
+
27
+ gr.Markdown("# 1. Define the plan of your document")
28
+
29
+ f = open('data/usage.txt', 'r')
30
+ usage = f.read()
31
+
32
+ input_text = gr.Textbox(
33
+ label="enter your text",
34
+ lines=25,
35
+ max_lines=25,
36
+ interactive=True,
37
+ elem_classes="selected_",
38
+ placeholder=usage,
39
+ )
40
+
41
+ with gr.Row():
42
+ upload_btn = gr.UploadButton(type='file')
43
+ example1 = gr.Button("One task example")
44
+ example2 = gr.Button("Several tasks example")
45
+
46
+ """
47
+ 2. source components
48
+ """
49
+
50
+ gr.Markdown("# 2. Choose the sources for the document generation")
51
+
52
+ with gr.Column(visible=True, variant='panel') as select_col:
53
+ gr.Markdown("### Select the sources")
54
+ source_radio = gr.Radio(
55
+ choices=["Unknown sources", "My own sources"],
56
+ label="",
57
+ value="Unknown sources",
58
+ visible=True,
59
+ )
60
+
61
+ with gr.Column(visible=False, variant='panel') as db_col:
62
+ gr.Markdown("### My sources")
63
+ db_list_comp = gr.CheckboxGroup(
64
+ label="Current content",
65
+ info="These documents are currently your sources. Unselect the documents you don't want to be taken"
66
+ "into account when generating the document",
67
+ visible=True,
68
+ interactive=True,
69
+ )
70
+ with gr.Row():
71
+ db_reset_btn = gr.Button("Reset the sources", visible=False).style(full_width=False, size="sm")
72
+ db_add_doc_btn = gr.Button("Add new documents", visible=True).style(full_width=False, size="sm")
73
+
74
+ with gr.Column(visible=False, variant="panel") as add_col:
75
+ gr.Markdown("### Add new documents ")
76
+
77
+ with gr.Tab("From Wikipedia"):
78
+ wiki_fetch_btn = gr.Button("Search for Wikipedia pages", visible=True)
79
+ wiki_fetch_btn.style(full_width=False, size="sm")
80
+ wiki_list_comp = gr.CheckboxGroup(
81
+ label="Select the wiki pages",
82
+ info="The selected pages can be added to sources",
83
+ visible=False,
84
+ interactive=True,
85
+ )
86
+
87
+ wiki_add_to_db_btn = gr.Button("Add selection to sources", visible=False)
88
+ wiki_add_to_db_btn.style(full_width=False, size="sm")
89
+
90
+ with gr.Tab("From disk"):
91
+ my_files_list_comp = gr.Files(
92
+ label="Upload own documents",
93
+ info="Your selected documents provide the content for generating the output document",
94
+ visible=True,
95
+ )
96
+ my_files_add_to_db_btn = gr.Button("Add files to sources", visible=False)
97
+ my_files_add_to_db_btn.style(full_width=False, size="sm")
98
+
99
+ add_close_btn = gr.Button("Close").style(size='sm', full_width=False)
100
+
101
+ """
102
+ 3. Generate (and inspect the document)
103
+ """
104
+
105
+ gr.Markdown("# 3. Generate the document")
106
+
107
+ generate_btn = gr.Button("Generate", interactive=True)
108
+
109
+ output_text = gr.Textbox(
110
+ label="Generated document",
111
+ value="",
112
+ lines=25,
113
+ max_lines=25,
114
+ interactive=False,
115
+ )
116
+
117
+ generated_file = gr.File(
118
+ interactive=False,
119
+ visible=False,
120
+ )
121
+
122
+ with gr.Column():
123
+ pass
124
+
125
+ """
126
+ ==================================
127
+ B. Logic part
128
+ ==================================
129
+ """
130
+
131
+ """
132
+ B.1 Input text
133
+ """
134
def upload_input_file(file_):
    """Forward an uploaded file to `upload_file`, using the temp-file path
    Gradio stores in the upload object's `.name`."""
    path = file_.name
    return upload_file(path)
136
+
137
+
138
def upload_example_file(btn, input_id_):
    """Copy the example file matching the clicked button into a per-session
    directory under tmp_input/ and load its content into the input textbox.

    Parameters:
        btn: label of the clicked example button (selects which example file).
        input_id_: current session id state (-1 means "not yet assigned").

    Returns a Gradio update dict for [input_text, input_id].

    Fix vs. original: `os.mkdir('tmp_input/' + long_id)` raised if the
    `tmp_input/` parent was missing or the session dir already existed;
    `os.makedirs(..., exist_ok=True)` handles both. Paths are built with
    os.path.join for portability.
    """
    filename = "onetask_example.txt" if btn == "One task example" else "long_example.txt"
    long_id = control.get_long_id(input_id_)
    session_dir = os.path.join('tmp_input', long_id)
    os.makedirs(session_dir, exist_ok=True)
    copypath = os.path.join(session_dir, filename)
    shutil.copy(os.path.join("data", filename), copypath)
    update_ = upload_file(copypath)
    # Persist the (possibly freshly minted) session id back into the state.
    update_[input_id] = gr.update(value=long_id)
    return update_
147
+
148
+
149
def upload_file(filename):
    """Read `filename` and return a Gradio update dict that places its
    content into the input textbox.

    Fix vs. original: the file handle was opened and never closed; a
    context manager now guarantees it is released.
    """
    with open(filename, "r") as f_:
        input_text_ = f_.read()
    return {
        input_text: gr.update(value=input_text_)
    }
156
+
157
+
158
+ input_id = gr.State(-1)
159
+
160
+ upload_btn.upload(upload_input_file, inputs=[upload_btn], outputs=[input_text])
161
+ example1.click(upload_example_file, inputs=[example1, input_id], outputs=[input_text, input_id])
162
+ example2.click(upload_example_file, inputs=[example2, input_id], outputs=[input_text, input_id])
163
+
164
+ """
165
+ --------------------
166
+ B.2 Logic for sources
167
+ --------------------
168
+ """
169
+
170
+
171
def source_fn(source_, db_collection_):
    """Toggle the "My sources" panel based on the selected source mode,
    creating (or reusing) the backing collection when the user opts for
    their own sources."""
    if source_ != "My own sources":
        return {
            db_col: gr.update(visible=False),
        }
    long_id = control.get_long_id(db_collection_)
    control.get_or_create_collection(long_id)
    return {
        db_col: gr.update(visible=True),
        db_collection_var: long_id,
    }
187
+
188
+
189
def db_reset_fn(wiki_source_, db_collection_):
    """Empty the source collection and reset the dependent UI state:
    clears the stored wiki/file title lists and the database checkbox
    group, and re-offers all previously fetched wiki candidates."""
    collection = control.get_or_create_collection(db_collection_)
    control.reset_collection(collection)
    has_wiki_candidates = len(wiki_source_) > 0
    return {
        wiki_db_var: [],
        my_files_db_var: [],
        db_reset_btn: gr.update(visible=False),
        db_list_comp: gr.update(value=[], choices=[]),
        wiki_list_comp: gr.update(value=wiki_source_, choices=wiki_source_),
        wiki_add_to_db_btn: gr.update(visible=has_wiki_candidates),
    }
205
+
206
+
207
def db_add_doc_fn():
    """Open the "add new documents" panel and hide the button that
    triggered it."""
    return {
        db_add_doc_btn: gr.update(visible=False),
        add_col: gr.update(visible=True),
    }
216
+
217
+
218
def add_close_fn():
    """Close the "add new documents" panel and restore its trigger
    button (mirror of db_add_doc_fn)."""
    return {
        db_add_doc_btn: gr.update(visible=True),
        add_col: gr.update(visible=False),
    }
227
+
228
+
229
def wiki_fetch_fn(wiki_db_files_, input_text_):
    """Look up Wikipedia page titles relevant to the tasks described in
    the input text, and offer those not already stored in the source db."""
    candidates = control.wiki_fetch(input_text_)
    fresh = [page for page in candidates if page not in wiki_db_files_]
    return {
        wiki_list_comp: gr.update(visible=True, value=fresh, choices=fresh),
        wiki_add_to_db_btn: gr.update(visible=True),
        wiki_source_var: candidates,
    }
241
+
242
+
243
async def wiki_add_to_db_fn(wiki_list_, wiki_source_, wiki_db_, db_list_, db_collection_):
    """Add the selected wiki pages to the source db, concurrently.

    Parameters:
        wiki_list_: titles currently checked in the wiki checkbox group.
        wiki_source_: all candidate titles fetched for the input text.
        wiki_db_: titles already stored in the db (UI-side bookkeeping).
        db_list_: titles currently shown in the "Database content" list.
        db_collection_: name of the backing collection.

    Returns a Gradio update dict refreshing the wiki/db lists and button
    visibility.
    """
    # Only upload pages not already stored.
    wiki_to_add = [wiki for wiki in wiki_list_ if wiki not in wiki_db_]
    db_list_ += wiki_to_add
    wiki_db_ += wiki_to_add
    # Candidates left to offer once the new ones are accounted for.
    wiki_source_remaining = [wiki for wiki in wiki_source_ if wiki not in wiki_db_]
    # Fetch + store all selected pages concurrently.
    tasks = [control.wiki_upload_and_store(wiki, db_collection_) for wiki in wiki_to_add]
    await asyncio.gather(*tasks)
    db_not_empty = 0 < len(db_list_)
    wiki_to_add_not_empty = 0 < len(wiki_source_remaining)
    update_ = {
        wiki_db_var: wiki_db_,
        wiki_list_comp: gr.update(value=wiki_source_remaining, choices=wiki_source_remaining),
        wiki_add_to_db_btn: gr.update(visible=wiki_to_add_not_empty),
        db_list_comp: gr.update(
            visible=True,
            value=db_list_,
            choices=db_list_,
            label="Database content"),
        # Reset/generate only make sense once the db holds something.
        db_reset_btn: gr.update(visible=db_not_empty),
        generate_btn: gr.update(visible=True, interactive=db_not_empty),
    }
    return update_
268
+
269
+
270
def my_files_list_fn(my_files_list_):
    """Show the "Add files to sources" button only while at least one
    file is selected in the file picker."""
    has_files = bool(my_files_list_)
    return {
        my_files_add_to_db_btn: gr.update(visible=has_files)
    }
276
+
277
+
278
async def my_files_add_to_db_fn(my_files_list_, my_files_db_, db_list_):
    """Upload the selected local files into the source db, concurrently,
    then refresh the db list and related button states.

    Returns a Gradio update dict for the file picker, db list and buttons.
    """
    # NOTE(review): `fi.name` is a full temp path while `my_files_db_` stores
    # basenames (see below) — this membership test presumably never filters
    # anything; confirm intended behavior.
    my_files_to_add = [fi.name for fi in my_files_list_ if fi.name not in my_files_db_]
    # NOTE(review): control.my_files_upload_and_store is declared with
    # (title, collection_name) but is called here with a single argument —
    # looks like the collection name is missing; verify against control.py.
    tasks = [control.my_files_upload_and_store(f_name) for f_name in my_files_to_add]
    await asyncio.gather(*tasks)
    # Keep only basenames for UI display and bookkeeping.
    my_files_to_add = [os.path.basename(f_name) for f_name in my_files_to_add]
    my_files_db_ += my_files_to_add
    db_list_ += my_files_to_add
    update_ = {
        my_files_list_comp: gr.update(value=None),
        my_files_add_to_db_btn: gr.update(visible=False),
        my_files_db_var: gr.update(value=my_files_db_),
        generate_btn: gr.update(interactive=True),
        db_reset_btn: gr.update(visible=True),
        db_list_comp: gr.update(
            visible=True,
            value=db_list_,
            choices=db_list_,
            label="Database content"),
    }
    return update_
301
+
302
+
303
+ wiki_source_var: [str] = gr.State([]) # list of wikipage titles of interest for the input text tasks
304
+ wiki_db_var: [str] = gr.State([]) # list of wiki document titles in the db (as seen from the UI)
305
+ my_files_db_var: [str] = gr.State([]) # list of titles of the files uploaded in the db (as seen from the UI)
306
+ db_collection_var: str = gr.State(-1) # name of the collection of documents sources in the db
307
+
308
+ source_radio.change(source_fn, inputs=[source_radio, db_collection_var], outputs=[db_col, db_collection_var])
309
+ db_add_doc_btn.click(db_add_doc_fn, inputs=[], outputs=[db_add_doc_btn, add_col])
310
+ add_close_btn.click(add_close_fn, inputs=[], outputs=[db_add_doc_btn, add_col])
311
+
312
+ wiki_fetch_btn.click(wiki_fetch_fn,
313
+ inputs=[wiki_db_var, input_text],
314
+ outputs=[wiki_list_comp, wiki_source_var, wiki_add_to_db_btn])
315
+ wiki_add_to_db_btn.click(wiki_add_to_db_fn,
316
+ inputs=[wiki_list_comp, wiki_source_var, wiki_db_var, db_list_comp, db_collection_var],
317
+ outputs=[db_list_comp, wiki_list_comp, wiki_db_var,
318
+ generate_btn, wiki_add_to_db_btn, db_reset_btn])
319
+
320
+ my_files_list_comp.change(my_files_list_fn, inputs=[my_files_list_comp], outputs=[my_files_add_to_db_btn])
321
+ my_files_add_to_db_btn.click(my_files_add_to_db_fn,
322
+ inputs=[my_files_list_comp, my_files_db_var, db_list_comp],
323
+ outputs=[my_files_add_to_db_btn, my_files_list_comp, my_files_db_var,
324
+ db_reset_btn, generate_btn, db_list_comp])
325
+ db_reset_btn.click(db_reset_fn,
326
+ inputs=[wiki_source_var, db_collection_var],
327
+ outputs=[wiki_db_var, my_files_db_var, db_list_comp, db_reset_btn,
328
+ db_add_doc_btn, wiki_list_comp, wiki_add_to_db_btn])
329
+
330
+ """
331
+ --------------------
332
+ B.3 Logic for generation
333
+ --------------------
334
+ """
335
+
336
+
337
def generate_fn(input_text_, source_, db_collection_, db_list_):
    """Generate the final document from the input text, writing it to a
    scratch file so it can be offered for download.

    Sources: either "Unknown sources" (pure LLM knowledge) or the user's
    own collection stored in the db.

    Fixes vs. original: the output file handle is now closed (it was
    leaked, with a pointless seek(0) on a write-mode handle), and the
    randomly named scratch directory no longer crashes on a name
    collision (makedirs with exist_ok=True).
    """
    rand_dir_path = os.path.join(".", str(np.random.randint(1000)))
    os.makedirs(rand_dir_path, exist_ok=True)
    fpath = os.path.join(rand_dir_path, "generated_text.txt")

    if source_ == "Unknown sources":
        output_text_ = control.generate_doc_from_gpt(input_text_)
    else:
        output_text_ = control.generate_doc_from_db(input_txt=input_text_,
                                                    collection_name=db_collection_,
                                                    from_files=db_list_)

    with open(fpath, "w") as f_:
        f_.write(output_text_)

    return {
        output_text: gr.update(value=output_text_),
        generated_file: gr.update(visible=True, value=fpath),
    }
362
+
363
+
364
+ generate_btn.click(generate_fn,
365
+ inputs=[input_text, source_radio, db_collection_var, db_list_comp],
366
+ outputs=[output_text, generated_file])
367
+
368
+ """
369
+ ==================================
370
+ Launch
371
+ ==================================
372
+ """
373
+
374
+ docgpt.queue().launch()
data/list ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "Years of Lead (Italy)",
3
+ "Terrorism in Italy",
4
+ "Red Brigades",
5
+ "Ordine Nuovo",
6
+ "Years of Lead (Italy)",
7
+ "Cold War",
8
+ "Terrorism in Europe",
9
+ "Palestinian terrorism"
10
+ ]
11
+
12
+ prompt = f"""
13
+ Your task is to identify the title of relevant wikipedia pages which would be helpful \
14
+ to expand on this text.
15
+
16
+ Give the page titles in the form of a JSON list, the text is delimited by triple \
17
+ backticks.
18
+
19
+ Text: ```{text}```
20
+ """
data/long_example.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !! terrorism in Italy in the years 70 and 80
2
+
3
+ # what happened
4
+ ++ describe the facts in Italy
5
+ It is a fact that Italy underwent several acts of terrorism in the 70s and 80s
6
+
7
+ ## summary of all events linked to terrorism
8
+ ?? summarize terrorism events from 70 to 90 in Italy (around 100 words)
9
+
10
+ ## the major events
11
+ ?? identify several events and describe no more than 5 events (around 50 words per event)
12
+ ## the major organisations
13
+ ?? identify major organisations (political parties, terrorists groups, etc.) and key individuals
14
+ # the global context
15
+ ++ give some context outside of Italy
16
+ ## a specific period during the cold war between USSR and the USA
17
+ ?? describe the specificities of the relationship between the US and USSR (around 100 words)
18
+ ## the terrorism in the 70-80s in Europe
19
+ ?? identify terrorism facts in the rest of Europe (e.g. Germany, France, Belgium) (around 50 words per fact)
20
+ ## Palestinian terrorism in the 70s 80s
21
+ ?? give some infos on Palestinian terrorism facts: acts, organisation and key individuals (around 100 words)
data/onetask_example.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ !! terrorism in Italy in the years 70 and 80
2
+
3
+ # what happened
4
+
5
+ ## the killing of Aldo Moro
6
+ ?? who killed Aldo Moro and why? (around 100 words)
7
+
data/usage.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !! Title
2
+
3
+ # Heading level 1
4
+ ## Heading level 2
5
+ ### Heading level 3
6
+ #### and so on ...
7
+
8
+ ?? Description of the paragraph to be generated
9
+
10
+ ++ Comment: adds additional context for the text generator
11
+
12
+ normal text: it is taken into account by the text generator but remains as is in the generated document
13
+
requirements.txt ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==4.2.2
5
+ anyio==3.6.2
6
+ async-timeout==4.0.2
7
+ attrs==23.1.0
8
+ backoff==2.2.1
9
+ beautifulsoup4==4.12.2
10
+ cachetools==5.3.0
11
+ certifi==2022.12.7
12
+ charset-normalizer==3.1.0
13
+ chromadb==0.3.21
14
+ click==8.1.3
15
+ clickhouse-connect==0.5.20
16
+ contourpy==1.0.7
17
+ cycler==0.11.0
18
+ dataclasses-json==0.5.7
19
+ duckdb==0.7.1
20
+ entrypoints==0.4
21
+ fastapi==0.95.1
22
+ ffmpy==0.3.0
23
+ filelock==3.11.0
24
+ fonttools==4.39.3
25
+ frozenlist==1.3.3
26
+ fsspec==2023.4.0
27
+ google-search-results==2.4.2
28
+ gptcache==0.1.12
29
+ gradio==3.27.0
30
+ gradio_client==0.1.3
31
+ h11==0.14.0
32
+ hnswlib==0.7.0
33
+ httpcore==0.17.0
34
+ httptools==0.5.0
35
+ httpx==0.24.0
36
+ huggingface-hub==0.13.4
37
+ idna==3.4
38
+ iniconfig==2.0.0
39
+ Jinja2==3.1.2
40
+ joblib==1.2.0
41
+ jsonschema==4.17.3
42
+ kiwisolver==1.4.4
43
+ langchain==0.0.141
44
+ linkify-it-py==2.0.0
45
+ lz4==4.3.2
46
+ markdown-it-py==2.2.0
47
+ MarkupSafe==2.1.2
48
+ marshmallow==3.19.0
49
+ marshmallow-enum==1.5.1
50
+ matplotlib==3.7.1
51
+ mdit-py-plugins==0.3.3
52
+ mdurl==0.1.2
53
+ monotonic==1.6
54
+ mpmath==1.3.0
55
+ multidict==6.0.4
56
+ mypy-extensions==1.0.0
57
+ networkx==3.1
58
+ nltk==3.8.1
59
+ numpy==1.24.2
60
+ openai==0.27.4
61
+ openapi-schema-pydantic==1.2.4
62
+ orjson==3.8.10
63
+ packaging==23.1
64
+ pandas==2.0.0
65
+ Pillow==9.5.0
66
+ pluggy==1.0.0
67
+ posthog==3.0.0
68
+ pydantic==1.10.7
69
+ pydub==0.25.1
70
+ pyparsing==3.0.9
71
+ pyrsistent==0.19.3
72
+ pytest==7.3.1
73
+ python-dateutil==2.8.2
74
+ python-dotenv==1.0.0
75
+ python-multipart==0.0.6
76
+ pytz==2023.3
77
+ PyYAML==6.0
78
+ regex==2023.3.23
79
+ requests==2.28.2
80
+ scikit-learn==1.2.2
81
+ scipy==1.10.1
82
+ semantic-version==2.10.0
83
+ sentence-transformers==2.2.2
84
+ sentencepiece==0.1.98
85
+ six==1.16.0
86
+ sniffio==1.3.0
87
+ soupsieve==2.4.1
88
+ SQLAlchemy==1.4.47
89
+ starlette==0.26.1
90
+ sympy==1.11.1
91
+ tenacity==8.2.2
92
+ threadpoolctl==3.1.0
93
+ tokenizers==0.13.3
94
+ toolz==0.12.0
95
+ torch==2.0.0
96
+ torchvision==0.15.1
97
+ tqdm==4.65.0
98
+ transformers==4.28.1
99
+ typing-inspect==0.8.0
100
+ typing_extensions==4.5.0
101
+ tzdata==2023.3
102
+ uc-micro-py==1.0.1
103
+ urllib3==1.26.15
104
+ uvicorn==0.21.1
105
+ uvloop==0.17.0
106
+ watchfiles==0.19.0
107
+ websockets==11.0.2
108
+ wget==3.2
109
+ wikipedia==1.4.0
110
+ yarl==1.8.2
111
+ zstandard==0.21.0
src/control/__pycache__/control.cpython-311.pyc ADDED
Binary file (5.91 kB). View file
 
src/control/control.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import asyncio # on va en avoir besoin :)
3
+ import string
4
+ import random
5
+ from datetime import datetime
6
+
7
+
8
+ from src.tools.semantic_db import get_or_create_collection, reset_collection
9
+ from src.tools.wiki import Wiki
10
+ from src.model.document import InputDoc, WikiPage
11
+ from src.tools.llm_tools import get_wikilist, get_public_paragraph, get_private_paragraph
12
+ from src.tools.semantic_db import add_texts_to_collection, query_collection
13
+
14
+ """
15
+ Tools
16
+ """
17
+
18
+
19
def get_long_id(id_):
    """Return `id_` unchanged unless it is the sentinel -1, in which case
    mint a fresh session id of the form MMDDHHMM-<10 random lowercase
    alphanumerics> (19 characters total)."""
    if id_ == -1:
        stamp = datetime.now().strftime("%m%d%H%M")
        alphabet = string.ascii_lowercase + string.digits
        suffix = ''.join(random.choice(alphabet) for _ in range(10))
        return stamp + '-' + suffix
    return id_
27
+
28
+
29
+ """
30
+ Input control
31
+ """
32
+
33
+
34
+
35
+ """
36
+ Source Control
37
+ """
38
+
39
def wiki_fetch(input_text: str) -> [str]:
    """Return the deduplicated Wikipedia page titles relevant to the
    tasks described in the input text."""
    titles = set()
    for task in InputDoc(input_text).tasks:
        titles.update(get_wikilist(task))
    return list(titles)
47
+
48
+
49
async def wiki_upload_and_store(wiki_title: str, collection_name: str):
    """Fetch one Wikipedia page and store its paragraphs in the given
    collection.

    Fixes vs. original: removed the no-op `wiki_title = wiki_title`
    self-assignment and replaced `type(wikipage) != str` with the
    idiomatic isinstance check.
    """
    wikipage = Wiki().fetch(wiki_title)
    if not isinstance(wikipage, str):
        texts = WikiPage(wikipage.page_content).get_paragraphs()
        add_texts_to_collection(coll_name=collection_name, texts=texts, file=wiki_title, source='wiki')
    else:
        # Wiki().fetch presumably signals failure by returning an error
        # message string — TODO confirm against src/tools/wiki.py.
        print(wikipage)
60
+
61
+
62
async def my_files_upload_and_store(title: str, collection_name: str):
    """Parse a user-supplied document and store its paragraphs in the
    given collection.

    Fixes vs. original: dropped the no-op `doc = title` / `title = title`
    assignments, and routed paragraph extraction through the parsed doc's
    container — `InputDoc` inherits `Doc`, which defines no
    `get_paragraphs` (only `WikiPage` does), so the original call raised
    AttributeError.
    NOTE(review): app.py invokes this with a single argument; the missing
    collection_name at the call site should be confirmed and fixed there.
    """
    texts = InputDoc(title).container.get_paragraphs()
    add_texts_to_collection(coll_name=collection_name, texts=texts, file=title, source='my_files')
67
+
68
+
69
+ """
70
+ Generate Control
71
+ """
72
+
73
+
74
def generate_doc_from_gpt(input_txt: str) -> str:
    """Resolve every task of the input document with a paragraph generated
    from the LLM's public knowledge, and splice the results back into the
    document text."""
    input_doc = InputDoc(input_txt)
    resolutions = [get_public_paragraph(task) for task in input_doc.tasks]
    return input_doc.replace_tasks(resolutions)
81
+
82
+
83
def generate_doc_from_db(input_txt: str, collection_name: str, from_files: [str]) -> str:
    """Resolve the input document's tasks against the user's own sources.

    Each task is first turned into an LLM-drafted paragraph used as a
    semantic query; matching texts are then retrieved from the collection
    (restricted to `from_files`), and a private paragraph is generated
    from those texts. The resolutions replace the task lines in the
    document.
    """
    input_doc = InputDoc(input_txt)
    tasks = input_doc.tasks
    # Phase 1: one query per task (LLM draft acts as the query text).
    queries = [get_public_paragraph(t) for t in tasks]
    # Phase 2: retrieve supporting texts for every query.
    texts_per_task = [query_collection(coll_name=collection_name, query=q, from_files=from_files)
                      for q in queries]
    # Phase 3: generate the final paragraph for each task from its texts.
    resolutions = [get_private_paragraph(task=t, texts=txts)
                   for t, txts in zip(tasks, texts_per_task)]
    return input_doc.replace_tasks(resolutions)
src/model/__pycache__/document.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
src/model/document.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class Doc:
    """Hierarchical representation of a structured text document.

    The raw text is split into `Line` objects (classified by their leading
    markers per `params['startswith_']`), organised into a tree of
    `Container`s, and the "task" lines (paragraph-generation requests) are
    collected so they can later be replaced by generated text.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        # Fix: the original default was a shared mutable dict ({});
        # use a None sentinel and a fresh dict per call instead.
        self.params = {} if params is None else params
        # Keep only non-blank lines, each parsed according to params.
        self.lines = [Line(text.strip(), self.params) for text in fulltext.split("\n") if text.strip()]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title, father=self, params=self.params)
        # One task-context dict per container that carries a task.
        self.tasks = [c.get_task(self.container.one_liner) for c in self.container.containers if c.task]
        self.fulltext = fulltext

    def _get_title(self, title):
        """Pull the title off the first line for 'input_text' docs (when the
        first line is marked as a title); otherwise keep the given title.
        Returns (title, remaining_lines)."""
        lines = self.lines
        if self.params['type'] == 'input_text':
            if self.lines and self.lines[0] and self.lines[0].type == 'title':
                title = self.lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines

    def replace_tasks(self, resolutions: [str]):
        """Return `self.fulltext` with each task line replaced, in order, by
        the corresponding entry of `resolutions`.

        Bug fix: the original evaluated `next(iter(resolutions))` for every
        task line, which re-created the iterator each time and therefore
        always substituted the FIRST resolution. A single shared iterator
        now consumes resolutions sequentially (raises StopIteration if
        fewer resolutions than task lines are supplied).
        """
        starts = self.params['startswith_']
        reverts = {starts[k]: k for k in starts}
        task_starter = reverts['task']
        res_iter = iter(resolutions)
        lines = self.fulltext.split('\n')
        new_lines = [line if not line.startswith(task_starter) else next(res_iter) for line in lines]
        new_fulltext = "\n".join(new_lines)
        return new_fulltext
28
+
29
+
30
class InputDoc(Doc):
    """A user-authored input document: '!!' marks the title, '++' comments,
    '??' tasks, and markdown-style '#' prefixes mark heading levels."""

    def __init__(self, fulltext='', title=''):
        markers = {'!!': 'title', '++': 'comment', '??': 'task',
                   '# ': '1', '## ': '2', '### ': '3', '####': '4', '#####': '5', '######': '6'}
        self.params = {
            'type': 'input_text',
            'startswith_': markers,
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)
40
+
41
+
42
class WikiPage(Doc):
    """A fetched Wikipedia article: '==' markers delimit headings (with
    matching suffixes stripped), and boilerplate sections (references,
    external links, ...) are discarded from the tree."""

    def __init__(self, fulltext='', title=''):
        self.params = {
            'type': 'wiki',
            'startswith_':
                {'== ': '1', '=== ': '2', '==== ': '3', '===== ': '4', '====== ': '5', '======= ': '6'},
            'endswith_':
                [' ==', ' ===', ' ====', ' =====', ' ======', ' ======'],

            'discarded': ["See also", "Notes", "References", "Sources", "External links", "Bibliography",
                          "Cinematic adaptations", "Further reading", "Maps"]
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Flatten the section tree into text chunks of roughly `chunk`
        characters each (delegates to the root container)."""
        return self.container.get_paragraphs(chunk)
59
+
60
+
61
class Container:
    """A node of the document tree.

    Holds this section's own normal/comment/task lines, its child sections,
    and precomputed text aggregates (`one_liner`, `root_text`, `text`) used
    for retrieval and task prompting.
    """

    # NOTE(review): `lines=[]` and `params={}` are shared mutable default
    # arguments; they appear never to be mutated here, but replacing them
    # with None sentinels would be safer.
    def __init__(self, lines=[], level=0, title='', father=None, params={}):

        self.normals = []     # Line objects of type 'normal' at this level
        self.normal = ''      # their concatenated text
        self.comments = []    # Line objects of type 'comment'
        self.comment = ''
        self.tasks = []       # Line objects of type 'task'
        self.task = ''
        self.children = []    # child Containers (sub-sections)
        self.level = level
        self.title = title
        self.father = father  # parent Container (or owning Doc for the root)

        self._expand(lines)

        # Drop boilerplate sections (wiki pages define params['discarded']).
        # NOTE(review): params is not propagated to child Containers below,
        # so this filtering only applies at this level — confirm intended.
        if params and 'discarded' in params.keys():
            self.children = [child for child in self.children if child.title not in params['discarded']]

        # Flat list of this node and all descendants (pre-order).
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # Short description of this section: title + comments.
        self.one_liner = self.title + ' ' + self.comment
        # This section's own text, excluding children.
        self.root_text = self.one_liner + ' ' + self.normal
        # Full text including all descendants.
        self.text = self.root_text
        for child in self.children:
            self.text += ' ' + child.text

        self.summary = self.text

    def _expand(self, lines):
        """Distribute `lines` between this node and its children.

        State machine: until the first structure (heading) line, lines are
        classified into this node's normal/comment/task buckets; after
        that, lines are buffered and flushed into a child Container each
        time a heading of exactly one level deeper is met. Deeper headings
        are buffered as part of the current child's lines.
        """
        new_child = False        # have we met the first sub-heading yet?
        new_child_lines = []     # buffer for the child being accumulated
        new_child_title = []     # its title (set from the heading line)
        for line in lines:
            if not new_child:
                if line.type == 'normal':
                    self.normals.append(line)
                    self.normal += ' ' + line.text
                elif line.type == 'comment':
                    self.comments.append(line)
                    self.comment += ' ' + line.text
                elif line.type == 'task':
                    self.tasks.append(line)
                    self.task += ' ' + line.text
                elif line.is_structure:
                    # First sub-heading: switch to child-buffering mode.
                    new_child = True
                    new_child_lines = []
                    new_child_title = line.text
                    # Force the heading's level relative to this node.
                    line.level = self.level + 1
                    self.one_liner = self.title + self.comment
            else:
                if self.level + 1 < line.level or not line.is_structure:
                    # Deeper heading or content line: belongs to current child.
                    new_child_lines.append(line)
                elif self.level + 1 == line.level:
                    # Sibling heading: flush the current child, start a new one.
                    self.children.append(Container(lines=new_child_lines,
                                                   level=self.level + 1,
                                                   title=new_child_title,
                                                   father=self))
                    new_child_lines = []
                    new_child_title = line.text
        # Flush the last buffered child, if any.
        if new_child:
            self.children.append(Container(lines=new_child_lines,
                                           level=self.level + 1,
                                           title=new_child_title,
                                           father=self))

    def get_task(self, doc_one_liner):
        """Build the prompt-context dict for this section's task: its
        description, what the section and document are about, the parent
        section, and the one-liners of the sibling sections before/after."""
        siblings_ = self.father.children.copy()
        index = siblings_.index(self)
        siblings_before_context = [sibling.one_liner for idx, sibling in enumerate(siblings_) if idx < index]
        siblings_after_context = [sibling.one_liner for idx, sibling in enumerate(siblings_) if index < idx]

        task = {'description': self.task,
                'about': self.one_liner,
                'doc_description': doc_one_liner,
                'above': self.father.one_liner,
                'before': siblings_before_context,
                'after': siblings_after_context}
        return task

    def get_paragraphs(self, chunk=500):
        """Flatten the subtree into a list of texts, recursing into children
        whenever a node's full text exceeds `chunk` characters."""
        if len(self.text) < chunk:
            paragraphs = [self.text]
        else:
            paragraphs = [self.root_text]
            for child in self.children:
                paragraphs += child.get_paragraphs(chunk)
        return paragraphs
151
+
152
+
153
class Line:
    """A single document line, classified by its leading marker.

    `params['startswith_']` maps a prefix (e.g. '??', '# ') to a line type
    ('task', 'comment', ..., or a digit string for heading depth). An
    optional `params['endswith_']` lists, positionally per prefix, a suffix
    to strip (used for wiki '== Title ==' headings).
    """

    def __init__(self, text, params):
        self.text = text
        self.type, self.text = self._parse_text(params)
        # Heading types are digit strings encoding depth; everything else
        # ('normal', 'task', ...) gets level -1.
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self, params):
        def strip_text(text_, start, end):
            # Bug fix: the original used text_.split(start)[1], which cut
            # the line at the SECOND occurrence of the marker (e.g.
            # '?? a ?? b' became 'a'); slicing off the known prefix keeps
            # the full remainder. Likewise, .split(end)[0] truncated at a
            # mid-text occurrence of the suffix; we now strip only a true
            # trailing suffix.
            text_ = text_[len(start):]
            if end != "" and text_.endswith(end):
                text_ = text_[:-len(end)]
            return text_.strip()

        startswith_ = params['startswith_']
        endswith_ = params['endswith_'] if 'endswith_' in params.keys() else [""] * len(startswith_)
        # First marker (in insertion order) that matches wins.
        types = [(strip_text(self.text, starter, endswith_[i]), startswith_[starter])
                 for i, starter in enumerate(startswith_.keys())
                 if self.text.startswith(starter)]
        (text, type_) = types[0] if types else (self.text, 'normal')
        return type_, text.strip()
src/model/model.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
class Container:
    """A hierarchical text node: recursively splits `fulltext` into children
    and collects flat text chunks ("docs") for indexing.

    NOTE(review): expand() reads Container.title_separators,
    Container.title_headers, Container.separators and Container.discarded,
    none of which are defined in this module — any instance with level > 0
    (or a long level-0 fulltext) raises AttributeError until they are
    provided. Presumably they mirror the sibling document model; confirm.
    """

    def __init__(self, title: str = '', fulltext: str = '', level: int = 0):
        self.title = title
        self.fulltext = fulltext
        self.children = []   # sub-containers created by expand()
        self.text = ''       # text owned directly by this node
        self.level = level   # 0 is the document root
        self.docs = []       # flat chunk list, filled by to_docs()
        self.expand()
        self.to_docs()

    def expand(self, max_length=700):
        """Parse the title out of fulltext and split long text into children."""
        if 0 < self.level:
            split_title = self.fulltext.split(Container.title_separators[self.level])
            if 1 < len(split_title):
                self.title += ('\n' + re.sub(Container.title_headers[self.level], '', split_title[0]))
                self.fulltext = split_title[1]
            if self.title in Container.discarded:
                # Discarded sections contribute nothing.
                self.fulltext = self.text = ''
        if self.fulltext:
            if max_length < len(self.fulltext):
                split_text = self.fulltext.split(Container.separators[self.level])
                if self.fulltext[0] != '=':
                    # Leading chunk before the first separator belongs to this node.
                    self.text += self.title + '\n' + split_text[0]
                    split_text.pop(0)
                self.children = [Container(fulltext=t, level=self.level + 1, title=self.title)
                                 for t in split_text]
            else:
                self.text += '\n' + self.fulltext

    def to_docs(self):
        """Collect this node's own text plus every child's docs."""
        # Only keep own text when it is substantial (> 60 chars).
        self.docs = [self.text] if 60 < len(self.text) else []
        for child in self.children:
            # BUG FIX: was `self.docs += child.root_text`, but Container has no
            # root_text attribute (AttributeError as soon as children exist);
            # children already built their own .docs in __init__.
            self.docs += child.docs

    def group_docs(self, max_length=700):
        """Greedily merge consecutive docs so grouped chunks stay under max_length."""
        grouped_docs = []
        for doc in self.docs:
            if grouped_docs and len(grouped_docs[-1]) + len(doc) < max_length:
                doc = grouped_docs.pop() + ' ' + doc
            grouped_docs.append(doc)
        return grouped_docs

    def __str__(self):
        """Indented debug card for this node and, recursively, its children."""
        card = "... level : " + str(self.level) + " words :" + str(len(self.text.split(' '))) + "\n"
        card += "... title : " + self.title[:100] + "\n"
        card += "... text : " + self.text[:100] + "\n"
        card += "... fulllength : " + str(len(self.fulltext)) + "\n"
        card += "... length : " + str(len(self.text)) + "\n\n"
        for child in self.children:
            card += child.__str__()
        return card

    def get_texts(self):
        """Public accessor: the grouped document chunks."""
        return self.group_docs()
src/tools/__pycache__/llm_tools.cpython-311.pyc ADDED
Binary file (6.95 kB). View file
 
src/tools/__pycache__/llms.cpython-311.pyc ADDED
Binary file (573 Bytes). View file
 
src/tools/__pycache__/semantic_db.cpython-311.pyc ADDED
Binary file (4.5 kB). View file
 
src/tools/__pycache__/wiki.cpython-311.pyc ADDED
Binary file (2.82 kB). View file
 
src/tools/llm_tools.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipedia
2
+ import json
3
+ from langchain import PromptTemplate
4
+ from langchain.vectorstores import Chroma
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+
7
+ from src.tools.llms import openai_llm
8
+ from src.tools.wiki import Wiki
9
+ from src.model.document import WikiPage
10
+
11
+
12
+
13
def get_wikilist(task: dict) -> list:
    """
    Get wikipedia page titles useful for solving the given paragraph task.

    Asks the LLM for an initial JSON list of titles, widens each title with
    related wikipedia search results, and returns the deduplicated list.

    Raises:
        json.JSONDecodeError: when the LLM answer is not valid JSON.
    """

    llm = openai_llm
    template = (f"\n"
                f" Your task consists in finding the list of wikipedia page titles which provide useful content "
                f" for a paragraph whose description is delimited by triple backticks: ```{task['description']}```\n"
                f" \n"
                f" The paragraph belongs at the top level of the hierarchy to a document"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" \n"
                f" Format your response as a JSON list of strings separated by commas.\n"
                f" \n"
                f"\n"
                f" ")

    # Dead code removed: a PromptTemplate was built here but never used
    # (the template is sent to the LLM directly).
    wikilist = json.loads(llm(template))

    # Widen each suggested title with up to `expand_factor` related results.
    expand_factor = 3
    expanded_wikilist = []
    for wikipage in wikilist:
        expanded_wikilist += wikipedia.search(wikipage, expand_factor)

    # Deduplicate; order is not preserved.
    return list(set(expanded_wikilist))
61
+
62
+
63
def get_public_paragraph(task: dict) -> str:
    """Generate the paragraph described by `task` with the LLM alone (no sources).

    `task` is the context dict ('description', 'doc_description', 'above',
    'before', 'after') that situates the paragraph inside its document.
    """

    llm = openai_llm
    # Prompt: describe the target paragraph and its surrounding context so the
    # model keeps the text coherent with the hierarchy and its neighbours.
    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    p = llm(template)

    return p
94
+
95
+
96
def create_index(wikilist: list):
    """
    Build the Chroma index of wiki pages for the given list of titles.

    Pages that fail to fetch (Wiki.fetch returns a str) are skipped.
    """
    fetch = Wiki().fetch

    # BUG FIX: fetch each title ONCE. Previously `fetch(title)` was called
    # twice per title (once in the filter, once for the value), doubling the
    # network round-trips.
    fetched = [(title, fetch(title)) for title in wikilist]
    pages = [(title, page) for title, page in fetched if type(page) != str]

    texts = []
    chunk = 800
    for title, page in pages:
        texts.append(WikiPage(title=title, fulltext=page.page_content))

    doc_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=chunk,
        chunk_overlap=100,
        length_function=len,
    )

    # NOTE(review): only the FIRST fetched page is split and indexed —
    # presumably all of `texts` was intended. Left as-is because widening it
    # changes the index content; confirm with the author.
    paragraphs = texts[0].get_paragraphs(chunk=chunk)

    split_texts = []
    for p in paragraphs:
        split_texts += doc_splitter.split_text(p)

    # Sanity checks on the chunking invariants.
    for split_text in split_texts:
        assert type(split_text) == str
        assert 0 < len(split_text) < 2 * 500

    wiki_index = Chroma.from_texts(split_texts)

    return wiki_index
128
+
129
+
130
def get_wiki_paragraph(wiki_index, task: dict) -> str:
    """Generate the paragraph described by `task`, grounded in wiki content.

    A draft paragraph is first produced without sources, then used as the
    semantic query against `wiki_index`; the retrieved chunks are injected
    into the final generation prompt.

    NOTE(review): `semantic_search` is not defined or imported in this module —
    this call raises NameError as written; presumably a similarity-search on
    the index was intended. Confirm before fixing.
    """

    task_description = get_public_paragraph(task)
    wiki_paragraphs = semantic_search(wiki_index, task_description)
    # NOTE(review): "/n/n" is presumably a typo for "\n\n" — left unchanged.
    text_content = ""
    for p in wiki_paragraphs:
        text_content += p.page_content + "/n/n"

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
169
+
170
+
171
def get_private_paragraph(texts, task: dict) -> str:
    """Generate the paragraph described by `task`, grounded in the given texts.

    `texts` is an iterable of plain strings (e.g. retrieved private chunks)
    that are concatenated into the generation prompt as source material.
    """

    # NOTE(review): "/n/n" is presumably a typo for "\n\n" — left unchanged.
    text_content = ""
    for t in texts:
        text_content += t + "/n/n"

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
src/tools/llms.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from langchain.llms import OpenAI

import os


# SECURITY FIX: OpenAI, SerpAPI and HuggingFace API keys were hard-coded here
# and committed to source control. Committed secrets must be considered leaked:
# revoke and rotate them. Keys are now read from the environment — set
# OPENAI_API_KEY and SERPAPI_API_KEY before launching the app.
OpenAI_KEY = os.environ.get("OPENAI_API_KEY", "")
os.environ["OPENAI_API_KEY"] = OpenAI_KEY
openai_llm = OpenAI(temperature=0)  # deterministic completions for generation

SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY", "")
os.environ["SERPAPI_API_KEY"] = SERPAPI_API_KEY


"""
Optional HuggingFace fallback (disabled). If re-enabled, read the token from
the environment rather than hard-coding it:
hf_llm = HuggingFaceHub(repo_id="google/flan-t5-small",
                        model_kwargs={"temperature": 0, "max_length": 1000},
                        huggingfacehub_api_token=os.environ["HF_API_KEY"])
"""
src/tools/semantic_db.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import chromadb
from datetime import datetime, timedelta
3
+
4
# Module-level Chroma client shared by every helper below.
chroma_client = chromadb.Client()
5
+
6
+
7
def get_or_create_collection(coll_name: str):
    """Fetch the collection named `coll_name`, creating it when absent.

    The first six characters of the name encode a timestamp that is stored
    as the collection's 'date' metadata (read by delete_old_collections).
    """
    stamp = coll_name[:6]
    return chroma_client.get_or_create_collection(name=coll_name, metadata={"date": stamp})
11
+
12
+
13
def get_collection(coll_name: str):
    """Return the existing collection named `coll_name` (the client raises if missing)."""
    return chroma_client.get_collection(name=coll_name)
16
+
17
+
18
def reset_collection(coll_name: str):
    """Fetch the named collection, clear its contents, and return it.

    NOTE(review): `delete()` with no ids/filter presumably clears every item —
    verify against the installed chromadb version, where an unfiltered delete
    may raise instead.
    """
    coll = chroma_client.get_collection(name=coll_name)
    coll.delete()
    return coll
22
+
23
+
24
def delete_old_collections(old=2):
    """Delete collections whose 'date' stamp is more than `old` hours in the past.

    The stamp is the "%m%d%H" string written by get_or_create_collection.

    BUG FIX: the previous integer comparison of "%m%d%H" stamps broke at day,
    month and year boundaries (e.g. 063023 vs 070100 differ by one hour but by
    thousands as integers); compare real datetimes instead.
    """
    now = datetime.now()
    for coll in chroma_client.list_collections():
        stamp = datetime.strptime(str(coll.metadata['date']), "%m%d%H").replace(year=now.year)
        # A stamp apparently in the future must come from the previous year.
        if now < stamp:
            stamp = stamp.replace(year=now.year - 1)
        if timedelta(hours=old) < now - stamp:
            chroma_client.delete_collection(coll.name)
32
+
33
+
34
def add_texts_to_collection(coll_name: str, texts: list, file: str, source: str):
    """
    Add texts to a collection; the texts all originate from the same file.

    Each text is keyed '<file>-<index>' and tagged with metadata {file: 1,
    'source': source}, which query_collection later uses to filter by file.
    """
    coll = chroma_client.get_collection(name=coll_name)
    metadatas = [{file: 1, 'source': source} for _ in texts]
    ids = [f"{file}-{i}" for i in range(len(texts))]
    # Re-adding the same file must overwrite, not duplicate: drop old ids first.
    coll.delete(ids=ids)
    coll.add(documents=texts, metadatas=metadatas, ids=ids)
43
+
44
+
45
def delete_collection(coll_name: str):
    """Remove the named collection (and its contents) from the shared client."""
    chroma_client.delete_collection(name=coll_name)
47
+
48
+
49
def list_collections():
    """Return every collection known to the shared chroma client."""
    return chroma_client.list_collections()
51
+
52
+
53
def query_collection(coll_name: str, query: str, from_files: list, n_results: int = 4):
    """Query a collection, restricting results to documents from the given files.

    Raises:
        ValueError: when `from_files` is empty (an empty '$or' filter is invalid).
    """
    # `assert` is stripped under `python -O`; validate input explicitly instead.
    if not from_files:
        raise ValueError("query_collection requires at least one source file")
    coll = chroma_client.get_collection(name=coll_name)
    # Metadata filter: a single {file: 1} clause, or an '$or' across files
    # (matching the per-file metadata written by add_texts_to_collection).
    where_ = [{file: 1} for file in from_files]
    where_ = where_[0] if len(where_) == 1 else {'$or': where_}
    # Never ask chroma for more results than the collection holds.
    n_results_ = min(n_results, coll.count())
    return coll.query(query_texts=query, n_results=n_results_, where=where_)
src/tools/wiki.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ from langchain.docstore.base import Docstore
4
+ from langchain.docstore.document import Document
5
+
6
+
7
+
8
class Wiki(Docstore):
    """
    Wrapper around the wikipedia API.
    """

    def __init__(self) -> None:
        """Check that the wikipedia package is installed."""
        try:
            import wikipedia  # noqa: F401
        except ImportError:
            raise ValueError(
                "Could not import wikipedia python package. "
                "Please install it with `pip install wikipedia`."
            )

    @staticmethod
    def fetch(searched_page: str) -> Union[str, Document]:
        """
        Try to fetch the wiki page.

        Returns a Document (content + url metadata) when the page exists;
        returns a string listing similar titles when the page is missing or
        ambiguous.
        """
        import wikipedia

        try:
            # wikipedia.set_lang("fr")
            # BUG FIX: fetch the page ONCE and reuse it. Previously
            # wikipedia.page() was called twice (for .content then .url),
            # doubling the network round-trips.
            page = wikipedia.page(searched_page)
            result: Union[str, Document] = Document(
                page_content=page.content, metadata={"page": page.url}
            )
        except (wikipedia.PageError, wikipedia.DisambiguationError):
            # Both failure modes answer with search suggestions
            # (previously two identical except blocks).
            result = f"Could not find [{searched_page}]. Similar: {wikipedia.search(searched_page)}"
        return result

    @staticmethod
    def search(searched_context: str) -> list:
        """
        Find wiki page titles related to the given context.

        (Was missing @staticmethod: calling it on an instance would have
        passed the instance as `searched_context`.)
        """
        import wikipedia

        try:
            # wikipedia.set_lang("fr")
            result = wikipedia.search(searched_context)
        except wikipedia.PageError:
            result = f"Could not find [{searched_context}]."
        return result
60
+
61
+
tests/.chroma/index/id_to_uuid_0c55a091-9f95-4a8d-b868-83d95412fdc4.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f74f5b108819cf0f64b68349537a233944b0a46f682c367c23f8ce581ed3cea8
3
+ size 444
tests/.chroma/index/id_to_uuid_31dac11a-6e77-49ca-a1b5-fce9e3fe275a.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:445b3f8d8a6b5cc676c01a5d1b4b67946a920fcce477e27e5b48cbdfeca61755
3
+ size 446
tests/.chroma/index/id_to_uuid_37825327-eef6-4255-92ac-787c21197d77.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ee30e4022243d2b6b479c491ee60b3f5e7c539e3a362328642405f73cf8fee
3
+ size 695
tests/.chroma/index/id_to_uuid_40ba1a00-ce47-4e51-a2d3-56eb96ecb82b.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f3997be16750f5f7b3978674fea2364fa6ab97c2dc426acb80eb1d422511443
3
+ size 98
tests/.chroma/index/id_to_uuid_46204504-325f-47e6-9176-e2054080ad57.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfaa528132e4e0652c83416114d1454c9b1d0b6e0fb1a8ef20b9734edb35323b
3
+ size 287
tests/.chroma/index/id_to_uuid_64afc7c0-c153-47d1-af52-55e1738ae76c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f833dce2979b320eaec2bb9154f769efb4c634e230149487dd7f221ecc25606
3
+ size 447
tests/.chroma/index/id_to_uuid_69550299-be81-45fa-8bbf-3d83be2d7991.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22ba07768895e87dbf09941729a79568b660241f036aa4d8121353c2facf4bba
3
+ size 288
tests/.chroma/index/id_to_uuid_78f80853-f999-4f5e-b320-41c98bd28592.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c056689ec182e73b665df5dcf2e863da9c5e3445fe588071cb3cd689d318df46
3
+ size 286
tests/.chroma/index/id_to_uuid_90d6076c-bb50-40ed-90a1-2df2243fd12e.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91659f10e125e40759e9951d8ab50e05d48fe8b13eb602ba18ebf12c2f7ef216
3
+ size 97
tests/.chroma/index/id_to_uuid_a10bf13e-424a-41cd-bcfb-27d8072711ea.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbd8492d26db8287ca812566c52964275721c15d0c06fbc160aa130e70433873
3
+ size 286
tests/.chroma/index/id_to_uuid_a6f9bfcf-0593-40b1-a282-a54d5b75d939.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62d35faab2d47684f6cdefe34f1ac5a45a300aee5eb85e3897c1a2651943675
3
+ size 97
tests/.chroma/index/id_to_uuid_aba244c9-042f-42a3-860c-a68e1ee0b4a5.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94e55e34d33c3c27775e0b6ccffd32c0f845ee8106fe96f667ba0804221d4b0d
3
+ size 98
tests/.chroma/index/id_to_uuid_afc3d29f-a033-4bcf-9ef4-e93b6211ac95.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:138806738237bc2498973bd85fe7493093ed4eaea0fa61bdcf22c15aeaf88da9
3
+ size 441
tests/.chroma/index/id_to_uuid_b5e184d4-5839-4b0b-9bd8-638fa6bc080a.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eb17c0f92ddc34e926384623c01a49f5fb2fe87a4605235402491faa8140af6
3
+ size 97
tests/.chroma/index/id_to_uuid_bcb0093e-68dd-4d75-a758-63ef7a681d92.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45ff2265b5042692a7436434dc4ec86fda1045b264ef22325a633d7c310479b1
3
+ size 441
tests/.chroma/index/id_to_uuid_bf57b36f-a918-4484-b897-79f751d5cad4.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfce13fc2f13ad950016bb48ccbbbd04ce13be07c748476dde1f940fcdfa0d52
3
+ size 5
tests/.chroma/index/id_to_uuid_e208b245-d2cd-4069-9a8c-d5f010d91afb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:276ff93bdb0f54dcda2d4bd238698fdf417d26879c578acb0520865cc1079631
3
+ size 444
tests/.chroma/index/id_to_uuid_f09229bd-8639-49e8-8a84-8e6e0aa11971.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfce13fc2f13ad950016bb48ccbbbd04ce13be07c748476dde1f940fcdfa0d52
3
+ size 5
tests/.chroma/index/index_0c55a091-9f95-4a8d-b868-83d95412fdc4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89aef35a2393395327581f3629478d4db7b6fd820f317ed13e60d05b4bdcd30d
3
+ size 35444
tests/.chroma/index/index_31dac11a-6e77-49ca-a1b5-fce9e3fe275a.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89aef35a2393395327581f3629478d4db7b6fd820f317ed13e60d05b4bdcd30d
3
+ size 35444
tests/.chroma/index/index_37825327-eef6-4255-92ac-787c21197d77.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcd0348e194c5b4661616aab5d4f4d1847f2a0bd5d5295b6230075a7bfb000ad
3
+ size 35444
tests/.chroma/index/index_40ba1a00-ce47-4e51-a2d3-56eb96ecb82b.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c915470fb0aee4ad80d1313da2ba6caae6ee813ffb23c4a3a4d73dd610e492c2
3
+ size 3456
tests/.chroma/index/index_46204504-325f-47e6-9176-e2054080ad57.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b7bcd06e5d6cc6c260fe5da47466cfc406a9e294063c5033210b435e5e69b6
3
+ size 13604
tests/.chroma/index/index_64afc7c0-c153-47d1-af52-55e1738ae76c.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89aef35a2393395327581f3629478d4db7b6fd820f317ed13e60d05b4bdcd30d
3
+ size 35444
tests/.chroma/index/index_69550299-be81-45fa-8bbf-3d83be2d7991.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b7bcd06e5d6cc6c260fe5da47466cfc406a9e294063c5033210b435e5e69b6
3
+ size 13604
tests/.chroma/index/index_78f80853-f999-4f5e-b320-41c98bd28592.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b7bcd06e5d6cc6c260fe5da47466cfc406a9e294063c5033210b435e5e69b6
3
+ size 13604
tests/.chroma/index/index_90d6076c-bb50-40ed-90a1-2df2243fd12e.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c915470fb0aee4ad80d1313da2ba6caae6ee813ffb23c4a3a4d73dd610e492c2
3
+ size 3456
tests/.chroma/index/index_a10bf13e-424a-41cd-bcfb-27d8072711ea.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b7bcd06e5d6cc6c260fe5da47466cfc406a9e294063c5033210b435e5e69b6
3
+ size 13604