tunght committed on
Commit
0d5fa0e
·
1 Parent(s): 53440c5

add debug info, working on glossary

Browse files
Files changed (2) hide show
  1. app.py +134 -95
  2. glossary.py +16 -0
app.py CHANGED
@@ -11,6 +11,15 @@ from langchain_anthropic import ChatAnthropic, ChatAnthropicMessages
11
  from langchain_groq import ChatGroq
12
  import openai
13
 
 
 
 
 
 
 
 
 
 
14
 
15
  feature_text = "Brand: Duckly. \nProduct name: Duck runner pro. \nKey properties: t-shirt, for running, sweat wicking, for marathon, 100% cotton."
16
 
@@ -215,12 +224,25 @@ def get_model(model_name):
215
  return chat
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def generate(*data):
219
  global visible
220
  print("visible", visible)
221
 
222
  nargs = 9
223
- feature, image, garment_type, model, temperature, excluded_words, included_words, glossary, debug = data[:nargs]
224
  struct_ref = data[nargs:]
225
 
226
  print(f"{feature=}")
@@ -231,9 +253,14 @@ def generate(*data):
231
  print(f"{excluded_words=}")
232
  print(f"{included_words=}")
233
  print(f"{debug=}")
234
- print(f"{glossary=}")
 
235
  # print(f"{struct_ref=}")
236
 
 
 
 
 
237
  chat = get_model(model)
238
 
239
  types, struct_languages = parse_structure(struct_ref)
@@ -255,6 +282,12 @@ def generate(*data):
255
  intended_use = "Intended use: " + ", ".join(image_features["intended_use"])
256
  print(f"Detected features: {detected_features}, Intended use: {intended_use}, Alt text: {alt_texts}")
257
 
 
 
 
 
 
 
258
  batch = []
259
  for i in range(visible + 1):
260
  structure = struct_ref[2 * i]
@@ -288,26 +321,35 @@ Include all included words in the output.
288
  Do not hallucinate any information.
289
  Use creative language in each output.
290
  Rate the quality of each version based on the following criteria:
291
- - how well it follows the reference copy's tone of voice, rythm, cadence and style.
292
  - how well it follows the reference structure.
293
  - how faithful it describes the product features.
 
 
294
  - how creative the language is.
295
  The score should be a number between 0 and 10 with 10 being the best quality.
296
  Return the result in the following JSON format:
297
- [
298
- {{
299
- "id": 1,
300
- "content": The first product description,
301
- "score": The score of the first product description,
302
- "explanation": A less than 20 word explanation of the score of the first product description
303
- }},
304
- {{
305
- "id": 2,
306
- "content": The second product description,
307
- "score": The score of the second product description
308
- "explanation": A less than 20 word explanation of the score of the second product description
309
- }},
310
- ]
 
 
 
 
 
 
 
311
  Make sure that the output is in JSON format, no extra text should be included in the output.
312
 
313
  Product information:
@@ -333,26 +375,35 @@ Include all included words in the output.
333
  Do not hallucinate any information.
334
  Use creative language in each output.
335
  Rate the quality of each version based on the following criteria:
336
- - how well it follows the reference copy's tone of voice, rythm, cadence and style.
337
- - how well it follows the reference copy's structure.
338
  - how faithful it describes the product features.
339
- - how creative the language is.
 
 
340
  The score should be a number between 0 and 10 with 10 being the best quality.
341
  Return the result in the following JSON format:
342
- [
343
- {{
344
- "id": 1,
345
- "content": The first product description,
346
- "score": The score of the first product description,
347
- "explanation": A less than 20 word explanation of the score of the first product description
348
- }},
349
- {{
350
- "id": 2,
351
- "content": The second product description,
352
- "score": The score of the second product description
353
- "explanation": A less than 20 word explanation of the score of the second product description
354
- }},
355
- ]
 
 
 
 
 
 
 
356
  Make sure that the output is in JSON format, no extra text should be included in the output.
357
 
358
  Product information:
@@ -381,23 +432,32 @@ Rate the quality of each version based on the following criteria:
381
  - how well it follows the reference tone of voice, rythm, cadence and style.
382
  - how well it follows the reference structure.
383
  - how faithful it describes the product features.
 
 
384
  - how creative the language is.
385
  The score should be a number between 0 and 10 with 10 being the best quality.
386
  Return the result in the following JSON format:
387
- [
388
- {{
389
- "id": 1,
390
- "content": The first product description,
391
- "score": The score of the first product description,
392
- "explanation": A less than 20 word explanation of the score of the first product description
393
- }},
394
- {{
395
- "id": 2,
396
- "content": The second product description,
397
- "score": The score of the second product description
398
- "explanation": A less than 20 word explanation of the score of the second product description
399
- }},
400
- ]
 
 
 
 
 
 
 
401
  Make sure that the output is in JSON format, no extra text should be included in the output.
402
 
403
  Product information:
@@ -416,61 +476,39 @@ Excluded words: {excluded_words}"""),]
416
  response = chat.batch(batch, temperature=temperature)
417
  print(response)
418
 
419
- # batch = []
420
- # rewrite_map = {}
421
- # for i in range(visible + 1):
422
- # structure = struct_ref[2 * i]
423
- # reference = struct_ref[2 * i + 1]
424
- # if len(reference.strip()) > 0:
425
- # messages = [
426
- # SystemMessage(content=f"""You are a helpful assistant that writes about products for ecommerce websites. You write in {languages[i]} language."""),
427
- # HumanMessage(content=f"""Rewrite the following product description in the style and tone of voice
428
- # of the reference product description.
429
- # Make sure that the structure and length of the output is similar to the reference product description.
430
- # Make sure that the output is written in {languages[i]} language.
431
- # Output the product description in markdown format.
432
-
433
- # Product description to rewirte:
434
- # ```{response[i].content}```
435
-
436
- # Reference product description:
437
- # ```{reference}```
438
- # """)
439
- # ]
440
- # batch.append(messages)
441
- # rewrite_map[i] = len(batch) - 1
442
-
443
- # print("Rewrite_map", rewrite_map)
444
- # print("Rewriting")
445
- # re_response = chat.batch(batch, temperature=temperature)
446
- # for i in range(len(re_response)):
447
- # print(f"Original: {response[i].content}")
448
- # print(f"Rewritten: {re_response[i].content}")
449
- # response = [re_response[rewrite_map[i]] if i in rewrite_map else response[i] for i in range(visible + 1)]
450
- # print("Done rewriting")
451
-
452
  parser = JsonOutputParser()
453
  jresponse = [parser.parse(msg.content) for msg in response]
454
  descriptions = []
455
  for jr in jresponse:
456
- bests = 0
457
- bestd = ""
458
- for d in jr:
459
- print(f'{d["score"]=}, {d["id"]=}, {bests=}')
460
- if d["score"] > bests:
461
- bests = d["score"]
462
- bestd = d["content"] + (f"\n\nDebug info:\n\nScore: {d['score']}\n\nExplanation: {d['explanation']}" if debug else "")
463
- elif d["score"] == bests and random.random() > 0.5:
464
- bestd = d["content"] + (f"\n\nDebug info:\n\nScore: {d['score']}\n\nExplanation: {d['explanation']}" if debug else "")
 
 
 
 
 
 
 
 
 
 
 
465
 
466
  descriptions.append(bestd)
467
- # description = "\n\n\n\n".join([msg.content for msg in response])
468
- md_content = "\n\n\n".join(descriptions)
469
 
470
  alt_texts_str = '\n\n### Alt text\n\n' + '\n- ' + '\n- '.join(alt_texts) if len(alt_texts) > 0 else ""
471
 
472
  alt_text_dict = {k[0]: v for (k, v) in zip(image, alt_texts)} if len(alt_texts) > 0 else {}
473
- result_json = {"outputs": descriptions, "alt_text": alt_text_dict}
474
  result_md = md_content + alt_texts_str + '\n'.join([f'![Product photo](data:image/png;base64,{base64_image} "{alt_text}")' if base64_image != "" else "" for (base64_image, alt_text) in zip(base64_images, alt_texts)])
475
  return result_md, result_json
476
 
@@ -526,7 +564,8 @@ with gr.Blocks() as demo:
526
  temperature = gr.Slider(minimum=0., maximum=1.0, value=0., interactive=True, label="Temperature", visible=True)
527
  excluded_words = gr.Textbox(label="Excluded words", interactive=True, lines=2)
528
  included_words = gr.Textbox(label="Included words", interactive=True, lines=2)
529
- glossary = gr.Dataframe(row_count = (2, "dynamic"), col_count=(2,"static"), headers=["Description", "Way of writing"], label="Glossary", interactive=True)
 
530
  debug = gr.Checkbox(label="Debug", interactive=True, value=True)
531
  with gr.Row():
532
  submit = gr.Button(value="Submit")
@@ -548,7 +587,7 @@ with gr.Blocks() as demo:
548
  md_output = gr.Markdown(label="Output", show_label=True)
549
  json_output = gr.JSON(label="JSON Output")
550
  submit.click(generate, inputs=[feature, image, garment_type, model, temperature,
551
- excluded_words, included_words, glossary, debug, *struct_ref],
552
  outputs=[md_output, json_output])
553
  # advanced.click(show_advanced, inputs=[], outputs=[model, temperature])
554
 
 
11
  from langchain_groq import ChatGroq
12
  import openai
13
 
14
+ from langchain import hub
15
+ from langchain_chroma import Chroma
16
+ from langchain_community.document_loaders import WebBaseLoader, CSVLoader
17
+ from langchain_core.output_parsers import StrOutputParser
18
+ from langchain_core.runnables import RunnablePassthrough
19
+ from langchain_openai import OpenAIEmbeddings
20
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
21
+ from langchain_core.vectorstores import VectorStoreRetriever
22
+
23
 
24
  feature_text = "Brand: Duckly. \nProduct name: Duck runner pro. \nKey properties: t-shirt, for running, sweat wicking, for marathon, 100% cotton."
25
 
 
224
  return chat
225
 
226
 
227
+ def build_glossary(glossary_file, fieldnames=None) -> VectorStoreRetriever:
228
+ loader = CSVLoader(file_path=glossary_file,
229
+ csv_args={"delimiter": ",",
230
+ "quotechar": '"'})
231
+ # "fieldnames": fieldnames})
232
+
233
+ docs = loader.load()
234
+ vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())
235
+ retriever = vectorstore.as_retriever()
236
+
237
+ return retriever
238
+
239
+
240
  def generate(*data):
241
  global visible
242
  print("visible", visible)
243
 
244
  nargs = 9
245
+ feature, image, garment_type, model, temperature, excluded_words, included_words, glossary_upload, debug = data[:nargs]
246
  struct_ref = data[nargs:]
247
 
248
  print(f"{feature=}")
 
253
  print(f"{excluded_words=}")
254
  print(f"{included_words=}")
255
  print(f"{debug=}")
256
+ # print(f"{glossary=}")
257
+ print(f"{glossary_upload=}")
258
  # print(f"{struct_ref=}")
259
 
260
+ glossary = None
261
+ if glossary_upload is not None:
262
+ glossary = build_glossary(glossary_upload)
263
+
264
  chat = get_model(model)
265
 
266
  types, struct_languages = parse_structure(struct_ref)
 
282
  intended_use = "Intended use: " + ", ".join(image_features["intended_use"])
283
  print(f"Detected features: {detected_features}, Intended use: {intended_use}, Alt text: {alt_texts}")
284
 
285
+ if glossary:
286
+ print("Getting terms")
287
+ terms = glossary.invoke(input=feature + detected_features)
288
+ for term in terms:
289
+ print(term)
290
+
291
  batch = []
292
  for i in range(visible + 1):
293
  structure = struct_ref[2 * i]
 
321
  Do not hallucinate any information.
322
  Use creative language in each output.
323
  Rate the quality of each version based on the following criteria:
324
+ - how well it follows the reference tone of voice, rythm, cadence and style.
325
  - how well it follows the reference structure.
326
  - how faithful it describes the product features.
327
+ - how well it avoid the excluded words.
328
+ - how well it includes the included words.
329
  - how creative the language is.
330
  The score should be a number between 0 and 10 with 10 being the best quality.
331
  Return the result in the following JSON format:
332
+ {{
333
+ "versions": [
334
+ {{
335
+ "id": 1,
336
+ "content": The first product description,
337
+ "explanation": A less than 20 word explanation of the score of the first product description,
338
+ "score": The score of the first product description
339
+ }},
340
+ {{
341
+ "id": 2,
342
+ "content": The second product description,
343
+ "explanation": A less than 20 word explanation of the score of the first product description,
344
+ "score": The score of the second product description
345
+ }},
346
+ ...
347
+ ],
348
+ "best_version": {{
349
+ "id": The id of the best version,
350
+ "explanation": Explanation for why this version is the best
351
+ }}
352
+ }}
353
  Make sure that the output is in JSON format, no extra text should be included in the output.
354
 
355
  Product information:
 
375
  Do not hallucinate any information.
376
  Use creative language in each output.
377
  Rate the quality of each version based on the following criteria:
378
+ - how well it follows the reference tone of voice, rythm, cadence and style.
379
+ - how well it follows the reference structure.
380
  - how faithful it describes the product features.
381
+ - how well it avoid the excluded words.
382
+ - how well it includes the included words.
383
+ - how creative the language is.
384
  The score should be a number between 0 and 10 with 10 being the best quality.
385
  Return the result in the following JSON format:
386
+ {{
387
+ "versions": [
388
+ {{
389
+ "id": 1,
390
+ "content": The first product description,
391
+ "explanation": A less than 20 word explanation of the score of the first product description,
392
+ "score": The score of the first product description
393
+ }},
394
+ {{
395
+ "id": 2,
396
+ "content": The second product description,
397
+ "explanation": A less than 20 word explanation of the score of the first product description,
398
+ "score": The score of the second product description
399
+ }},
400
+ ...
401
+ ],
402
+ "best_version": {{
403
+ "id": The id of the best version,
404
+ "explanation": Explanation for why this version is the best
405
+ }}
406
+ }}
407
  Make sure that the output is in JSON format, no extra text should be included in the output.
408
 
409
  Product information:
 
432
  - how well it follows the reference tone of voice, rythm, cadence and style.
433
  - how well it follows the reference structure.
434
  - how faithful it describes the product features.
435
+ - how well it avoid the excluded words.
436
+ - how well it includes the included words.
437
  - how creative the language is.
438
  The score should be a number between 0 and 10 with 10 being the best quality.
439
  Return the result in the following JSON format:
440
+ {{
441
+ "versions": [
442
+ {{
443
+ "id": 1,
444
+ "content": The first product description,
445
+ "explanation": A less than 20 word explanation of the score of the first product description,
446
+ "score": The score of the first product description
447
+ }},
448
+ {{
449
+ "id": 2,
450
+ "content": The second product description,
451
+ "explanation": A less than 20 word explanation of the score of the first product description,
452
+ "score": The score of the second product description
453
+ }},
454
+ ...
455
+ ],
456
+ "best_version": {{
457
+ "id": The id of the best version,
458
+ "explanation": Explanation for why this version is the best
459
+ }}
460
+ }}
461
  Make sure that the output is in JSON format, no extra text should be included in the output.
462
 
463
  Product information:
 
476
  response = chat.batch(batch, temperature=temperature)
477
  print(response)
478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  parser = JsonOutputParser()
480
  jresponse = [parser.parse(msg.content) for msg in response]
481
  descriptions = []
482
  for jr in jresponse:
483
+ print(f'{jr=}')
484
+ bestid = jr["best_version"]["id"]
485
+ for d in jr["versions"]:
486
+ if d["id"] == bestid:
487
+ bestd = d["content"] + (f"\n\nDebug info:\n\nScore: {d['score']}\n\nExplanation: {jr['best_version']['explanation']}" if debug else "")
488
+ bests = d["score"]
489
+ break
490
+ # bests = 0
491
+ # bestd = ""
492
+ # for d in jr:
493
+ # print(f'{d["score"]=}, {d["id"]=}, {bests=}')
494
+ # if d["score"] > bests:
495
+ # bests = d["score"]
496
+ # bestd = d["content"] + (f"\n\nDebug info:\n\nScore: {d['score']}\n\nExplanation: {d['explanation']}" if debug else "")
497
+ # elif d["score"] == bests and random.random() > 0.5:
498
+ # bestd = d["content"] + (f"\n\nDebug info:\n\nScore: {d['score']}\n\nExplanation: {d['explanation']}" if debug else "")
499
+ # if d["id"] == bestid:
500
+ # bests = d["score"]
501
+ # bestd = d["content"] + (f"\n\nDebug info:\n\nScore: {d['score']}\n\nExplanation: {d['explanation']}" if debug else "")
502
+ # break
503
 
504
  descriptions.append(bestd)
505
+
506
+ md_content = "\n\n---\n\n".join(descriptions)
507
 
508
  alt_texts_str = '\n\n### Alt text\n\n' + '\n- ' + '\n- '.join(alt_texts) if len(alt_texts) > 0 else ""
509
 
510
  alt_text_dict = {k[0]: v for (k, v) in zip(image, alt_texts)} if len(alt_texts) > 0 else {}
511
+ result_json = {"outputs": jresponse if debug else descriptions, "alt_text": alt_text_dict}
512
  result_md = md_content + alt_texts_str + '\n'.join([f'![Product photo](data:image/png;base64,{base64_image} "{alt_text}")' if base64_image != "" else "" for (base64_image, alt_text) in zip(base64_images, alt_texts)])
513
  return result_md, result_json
514
 
 
564
  temperature = gr.Slider(minimum=0., maximum=1.0, value=0., interactive=True, label="Temperature", visible=True)
565
  excluded_words = gr.Textbox(label="Excluded words", interactive=True, lines=2)
566
  included_words = gr.Textbox(label="Included words", interactive=True, lines=2)
567
+ # glossary = gr.Dataframe(row_count = (2, "dynamic"), col_count=(2,"static"), headers=["Description", "Way of writing"], label="Glossary", interactive=True)
568
+ glossary_upload = gr.UploadButton(label="Upload Glossary", interactive=True, file_types=["csv"])
569
  debug = gr.Checkbox(label="Debug", interactive=True, value=True)
570
  with gr.Row():
571
  submit = gr.Button(value="Submit")
 
587
  md_output = gr.Markdown(label="Output", show_label=True)
588
  json_output = gr.JSON(label="JSON Output")
589
  submit.click(generate, inputs=[feature, image, garment_type, model, temperature,
590
+ excluded_words, included_words, glossary_upload, debug, *struct_ref],
591
  outputs=[md_output, json_output])
592
  # advanced.click(show_advanced, inputs=[], outputs=[model, temperature])
593
 
glossary.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import traceback
3
+ import gradio as gr
4
+ import numpy as np
5
+ import os
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+
8
+ from langchain_openai.chat_models import ChatOpenAI
9
+ from langchain.schema import HumanMessage, SystemMessage, AIMessage
10
+ from langchain_anthropic import ChatAnthropic, ChatAnthropicMessages
11
+ from langchain_groq import ChatGroq
12
+ import openai
13
+
14
+
15
+ def glossary_rag():
16
+ pass