kenleeyx committed on
Commit
f6a99c0
·
1 Parent(s): 4716193

docs: Docstrings, comments and type hints added

Browse files

Also moved the prompt and user instructions to new files prompt.txt and
user_instructions.txt to de-clutter the app.py.

Files changed (3) hide show
  1. app.py +78 -68
  2. prompt.txt +17 -0
  3. user_instructions.txt +15 -0
app.py CHANGED
@@ -1,104 +1,114 @@
1
- import gradio as gr
2
- import os
3
- import pandas as pd
4
- from openai import OpenAI
5
- import openpyxl
6
- import json
 
7
 
 
 
 
 
 
 
 
 
 
8
  client = OpenAI(
9
  api_key=os.getenv('OPENAI_KEY'),
10
  organization=os.getenv('ORG_KEY'),
11
  project=os.getenv('PROJ_KEY')
12
  )
13
 
14
- #need to give info on how to convert to CSV
15
- title = "Automated Research Code Tagger"
16
- description = """
17
- ABOUT:\n
18
- This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
19
- and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.\n
20
-
21
- HOW TO USE:\n
22
- 1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)\n
23
- 2)Type in the name of the column where the quotes are located\n
24
- 3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc.
25
- This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.\n
26
- 4)All the responses from ChatGPT will be collated and displayed in the table on the right, together with the original quotes.
27
- You may then copy them into an Excel file for further processing. Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!\n
28
-
29
- Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
30
- I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.
31
- -Kenneth
32
- """
33
-
34
- prompt = """
35
- Given the quote below and the regular tag list below, evaluate each tag in the tag list and determine if the meaning of the quote can be described by that tag topic.
36
- If so, return the relevant tag in your response. Use only the tags provided in the list. Under no circumstances should you create new tag names.
37
-
38
- For the tags starting with a *, these tags should be treated as proper nouns(usually product names or slogans) and should not be used unless the quote explicitly contains the entire tag.
39
- For quotes with meanings that are more ambiguous and can relate to multiple tags, make no assumptions about their meanings and only add tags if the topic of the tag is actually mentioned in the quote.
40
- If there are no relevant tags to the quote, return an empty list.
41
-
42
- Quote:
43
- {quote}
44
-
45
- Tag list:
46
- {tags_list}
47
-
48
- Respond in the following format:
49
- {{
50
- "tags":[<tagName1>, <tagName2>]
51
- }}
52
- """
53
- def tag_quote(quote, tags_list):
54
  response = client.chat.completions.create(
55
  model = "gpt-4o-mini",
56
  response_format={"type": "json_object"},
57
  messages=[
58
  {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
59
- {"role": "user", "content": prompt.format(tags_list=tags_list, quote=quote)}
60
  ]
61
  )
62
- print(response.choices[0].message.content)
63
  return json.loads(response.choices[0].message.content)['tags']
64
-
65
- def process_quotes(quotes_file_path, quotes_col_name, tags_string):
66
- print(quotes_file_path)
67
- print(quotes_col_name)
68
- print(tags_string)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  tags_list = tags_string.split(',')
70
  tags_list = [tag.strip() for tag in tags_list]
71
 
72
- #next 3 lines are necessary as pd.read_excel will rename duplicate columns found in the excel file eg foo -> foo.1, hence we need to extract the first row alone and not as header, and then set it as header for the rest of the DF later.
73
- quotes_df_cols= pd.read_excel(quotes_file_path, header=None, nrows=1).values[0] #creates a df without header from the excel and takes the first row
74
- quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1) # converts row 2 onwards into the DF, without specifying a header
75
- quotes_df.columns = quotes_df_cols # sets the first row of excel file as header
 
 
 
 
76
 
 
77
  count = quotes_df.columns.tolist().count(quotes_col_name)
78
  if count == 0:
79
  raise gr.Error("No columns with this name found")
80
  elif count > 1:
81
- print("Count>1!!")
82
  raise gr.Error("Multiple columns with this name found, please rename to something unique")
83
  quotes_data = quotes_df[quotes_col_name]
 
 
84
  quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
 
 
85
  return quotes_df[[quotes_col_name, 'Tags']]
86
 
87
  demo = gr.Interface(
88
  fn=process_quotes,
89
  inputs=[
90
- gr.File(label="Quotes Excel File"), # File as generated by TFT software
91
- gr.Textbox(label="Name of quotes column"), # use this to identify the col with the quotes
92
- gr.Textbox(label = "List of tags separated by commas")
93
  ],
94
  outputs=gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
95
- title=title,
96
- description=description
97
  )
98
 
99
  demo.launch()
100
-
101
- # For later when I enable usage of own API key
102
- # api_key = gr.Textbox(
103
- # type="password", label="Enter your OpenAI API key here (Optional for Perceptech users)"
104
- # )
 
1
+ # Import necessary modules
2
+ import gradio as gr # For creating the interactive user interface
3
+ import os # For accessing environment variables
4
+ import pandas as pd # For easier data handling
5
+ from openai import OpenAI # For sending the quotes to OpenAI for tagging
6
+ import openpyxl # Requirement for reading Excel files into pandas Dataframes
7
+ import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
8
 
9
# Load the prompt template that tag_quote() fills in with the quote and tag list.
# The encoding is pinned to UTF-8 so the text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("prompt.txt", "r", encoding="utf-8") as prompt_file:
    PROMPT = prompt_file.read()

# Load the user instructions shown as the description of the Gradio interface.
with open("user_instructions.txt", "r", encoding="utf-8") as user_instruction_file:
    INSTRUCTIONS = user_instruction_file.read()
16
+
17
# Create the OpenAI client used by tag_quote(). The API key, organisation and
# project identifiers are read from environment variables; a variable that is
# not set is passed through as None.
client = OpenAI(
    project=os.environ.get("PROJ_KEY"),
    organization=os.environ.get("ORG_KEY"),
    api_key=os.environ.get("OPENAI_KEY"),
)
23
 
24
+ # Function to send the prompt with quote and tag list to OpenAI and get the tags for that quote back
25
def tag_quote(quote: str, tags_list: list) -> list:
    """
    Ask the OpenAI model which of the candidate tags apply to a single quote.

    The quote and the tag list are substituted into the PROMPT template and sent
    to the "gpt-4o-mini" chat model with a JSON-object response format. The
    "tags" entry of the JSON reply is returned.

    Args:
        quote (str): The quote or text to be analyzed. Missing values (None or
            NaN, which pandas produces for empty spreadsheet cells) and blank
            strings are accepted and produce an empty list without calling
            the API.
        tags_list (list): A list of potential tags to match against the quote.

    Returns:
        list: The tags reported by the model for this quote. An empty list is
        returned when the quote is missing/blank or when the reply contains no
        "tags" entry.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Empty cells arrive as NaN/None; formatting them into the prompt would send
    # the literal text "nan" to the model and pay for a meaningless request.
    if quote is None or (not isinstance(quote, str) and pd.isna(quote)):
        return []
    quote_text = str(quote).strip()
    if not quote_text:
        return []

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
            {"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote_text)},
        ],
    )

    reply = json.loads(response.choices[0].message.content)

    # The model is asked for {"tags": [...]}, but do not crash the whole batch
    # if a single reply omits the key or returns a bare value instead of a list.
    tags = reply.get("tags") if isinstance(reply, dict) else None
    if tags is None:
        return []
    return tags if isinstance(tags, list) else [tags]
49
+
50
+ # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
51
def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
    """
    Read quotes from an Excel file and tag each one with tag_quote().

    The first row of the sheet is used as the header. The column named by
    `quotes_col_name` must exist exactly once; every value in it is passed to
    `tag_quote` together with the tags parsed from `tags_string`.

    Args:
        quotes_file_path (str): Path to the Excel file containing the quotes.
        quotes_col_name (str): The name of the column containing the quotes.
        tags_string (str): A comma-separated string of potential tags. Blank
            entries (e.g. from a trailing comma) are ignored.

    Returns:
        pd.DataFrame: A DataFrame with two columns:
            - The original column containing the quotes.
            - A new column 'Tags' with the tags assigned to each quote.

    Raises:
        gr.Error: If no file or no tags were supplied, if the sheet is empty,
            or if the specified column name does not exist or is not unique.
    """
    # Split the user's tag string and drop blanks so that empty input or stray
    # commas do not send empty tags to the model.
    tags_list = [tag.strip() for tag in (tags_string or "").split(',') if tag.strip()]
    if not tags_list:
        raise gr.Error("Please enter at least one tag")

    if not quotes_file_path:
        raise gr.Error("Please upload an Excel file containing the quotes")

    # Read the sheet once WITHOUT a header. Letting pd.read_excel parse the
    # header would rename duplicate columns (eg foo -> foo.1), causing a
    # mismatch between quotes_col_name and the actual column name and hiding
    # duplicates from the uniqueness check below.
    raw_df = pd.read_excel(quotes_file_path, header=None)
    if raw_df.empty:
        raise gr.Error("The uploaded Excel file is empty")

    # Row 0 holds the real column names; the remaining rows are the data.
    quotes_df = raw_df.iloc[1:].reset_index(drop=True)
    quotes_df.columns = raw_df.iloc[0].values

    # Check that the column name given by the user exists and is unique
    count = quotes_df.columns.tolist().count(quotes_col_name)
    if count == 0:
        raise gr.Error("No columns with this name found")
    elif count > 1:
        raise gr.Error("Multiple columns with this name found, please rename to something unique")
    quotes_data = quotes_df[quotes_col_name]

    # Tag all the quotes one by one using tag_quote function
    quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))

    # Return only the quotes column and the new tags column
    return quotes_df[[quotes_col_name, 'Tags']]
101
 
102
  demo = gr.Interface(
103
  fn=process_quotes,
104
  inputs=[
105
+ gr.File(label="Quotes Excel File"),
106
+ gr.Textbox(label="Name of quotes column"),
107
+ gr.Textbox(label = "List of tags separated by commas")
108
  ],
109
  outputs=gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
110
+ title="Automated Research Code Tagger",
111
+ description=INSTRUCTIONS
112
  )
113
 
114
  demo.launch()
 
 
 
 
 
prompt.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Given the quote below and the regular tag list below, evaluate each tag in the tag list and determine if the meaning of the quote can be described by that tag topic.
2
+ If so, return the relevant tag in your response. Use only the tags provided in the list. Under no circumstances should you create new tag names.
3
+
4
+ For the tags starting with a *, these tags should be treated as proper nouns(usually product names or slogans) and should not be used unless the quote explicitly contains the entire tag.
5
+ For quotes with meanings that are more ambiguous and can relate to multiple tags, make no assumptions about their meanings and only add tags if the topic of the tag is actually mentioned in the quote.
6
+ If there are no relevant tags to the quote, return an empty list.
7
+
8
+ Quote:
9
+ {quote}
10
+
11
+ Tag list:
12
+ {tags_list}
13
+
14
+ Respond in the following format:
15
+ {{
16
+ "tags":[<tagName1>, <tagName2>]
17
+ }}
user_instructions.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ABOUT:
2
+ This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
3
+ and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
4
+
5
+ HOW TO USE:
6
+ 1) Upload a single-sheet Excel file containing the quotes in one column. (It is OK for the file to contain other data as well.)
7
+ 2) Type in the name of the column where the quotes are located.
8
+ 3) Type in a list of tags separated by commas. For proper names, slogans or other tags that should be treated as an inseparable unit, e.g. Nike's "Just Do It", add a * in front of the tag, e.g. tag1, *Just Do It, tag3, etc.
9
+ This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.
10
+ 4) All the responses from ChatGPT will be collated and displayed in the table on the right, together with the original quotes.
11
+ You may then copy them into an Excel file for further processing. Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!
12
+
13
+ Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
14
+ I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.
15
+ -Kenneth