kenleeyx committed on
Commit
243f5cb
·
1 Parent(s): 0c5d2b6

feat: include hash table of tagged tags as output

Browse files

-Added function count_tags which counts occurrences of each tag tagged
and returns the data in a DataFrame
-Updated Interface to also display above DataFrame
-Output file link now includes data from both DataFrames in separate
sheets
-Updated user instructions to reflect above changes
-Amend: update documentation comments for Counter import

Files changed (2) hide show
  1. app.py +46 -3
  2. user_instructions.txt +15 -9
app.py CHANGED
@@ -6,6 +6,7 @@ from openai import OpenAI # For sending the quotes to OpenAI for tagging
6
  import openpyxl # Requirement for reading Excel files into pandas Dataframes
7
  import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
8
  from dotenv import load_dotenv # For loading environment variables in local environment
 
9
 
10
  # Load environment variables from local .env file if it exists; otherwise this does nothing
11
  load_dotenv()
@@ -51,6 +52,41 @@ def tag_quote(quote: str, tags_list: list) -> list:
51
  )
52
  return json.loads(response.choices[0].message.content)['tags']
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
55
  def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
56
  """
@@ -100,11 +136,16 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
100
  # Tag all the quotes one by one using tag_quote function
101
  quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
102
 
 
 
 
103
  # Return only the quotes column and the new tags column
104
  output_df = quotes_df[[quotes_col_name, 'Tags']]
105
  output_file_path = "output.xlsx"
106
- output_df.to_excel(output_file_path, index=False)
107
- return output_df, output_file_path
 
 
108
 
109
  # Define user interface structure
110
  demo = gr.Interface(
@@ -115,8 +156,10 @@ demo = gr.Interface(
115
  gr.Textbox(label = "List of tags separated by commas")
116
  ],
117
  outputs=[
118
- gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
 
119
  gr.File(label="Output data in file format")
 
120
  ],
121
  title="Automated Research Code Tagger",
122
  description=INSTRUCTIONS
 
6
  import openpyxl # Requirement for reading Excel files into pandas Dataframes
7
  import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
8
  from dotenv import load_dotenv # For loading environment variables in local environment
9
+ from collections import Counter # For tabulating tag occurrences
10
 
11
  # Load environment variables from local .env file if it exists; otherwise this does nothing
12
  load_dotenv()
 
52
  )
53
  return json.loads(response.choices[0].message.content)['tags']
54
 
55
def count_tags(tags_list: list, tags_col: pd.Series) -> pd.DataFrame:
    """
    Creates a DataFrame indicating the number of occurrences of each tag from a column containing lists of tags.

    Every tag in tags_list appears in the output, with a count of 0 if it never occurs in
    tags_col. Tags found in tags_col that were not in tags_list are reported with a ! prefix.
    Rows follow the order of tags_list, followed by unexpected tags in order of first appearance.

    Args:
        tags_list (list): The list of tags given by the user.
        tags_col (pd.Series): A column of lists where each list contains tags which are
            (ideally but not always; depending on OpenAI) selected from the tags_list.
            A cell holding a single string is counted as one tag; cells that cannot be
            iterated (e.g. None or NaN) are treated as having no tags.

    Returns:
        pd.DataFrame: A DataFrame with two columns. 'Tag' contains individual tags (str) which have
            appeared either in the tags_list, the lists within the tags_col, or both. 'Count'
            contains the number of occurrences (int) of that tag within the lists in the tags_col.
    """
    # Seed the Counter with zeros so tags that never occur still show up in the output
    tags_counter = Counter({tag: 0 for tag in tags_list})
    # Build the membership set once; testing against the list would be a linear scan per tag
    known_tags = set(tags_list)

    for cell in tags_col:
        if isinstance(cell, str):
            # A bare string would otherwise be iterated character by character
            cell = [cell]
        elif not hasattr(cell, "__iter__"):
            # None / NaN / other scalars carry no tags
            continue
        for tag in cell:
            # Tags the user did not supply are flagged with a ! prefix
            key = tag if tag in known_tags else f"!{tag}"
            tags_counter[key] += 1

    # Convert the tags_counter to a DataFrame and return it
    return pd.DataFrame(list(tags_counter.items()), columns=['Tag', 'Count'])
88
+
89
+
90
  # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
91
  def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
92
  """
 
136
  # Tag all the quotes one by one using tag_quote function
137
  quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
138
 
139
+ # Create hash table of tag occurrences using count_tags function
140
+ tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
141
+
142
  # Return only the quotes column and the new tags column
143
  output_df = quotes_df[[quotes_col_name, 'Tags']]
144
  output_file_path = "output.xlsx"
145
+ with pd.ExcelWriter(output_file_path) as writer:
146
+ output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
147
+ tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
148
+ return output_df, tags_counter_df, output_file_path
149
 
150
  # Define user interface structure
151
  demo = gr.Interface(
 
156
  gr.Textbox(label = "List of tags separated by commas")
157
  ],
158
  outputs=[
159
+ gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
160
+ gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
161
  gr.File(label="Output data in file format")
162
+
163
  ],
164
  title="Automated Research Code Tagger",
165
  description=INSTRUCTIONS
user_instructions.txt CHANGED
@@ -2,14 +2,20 @@ ABOUT:
2
  This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
3
  and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
4
 
5
- HOW TO USE:
6
- 1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)
7
- 2)Type in the name of the column where the quotes are located
8
- 3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc.
9
- This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.
10
- 4)All the responses from ChatGPT will be collated and displayed in the table on the right, together with the original quotes.
11
- You may then copy them into an Excel file for further processing. Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!
12
 
 
 
 
 
 
 
 
13
  Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
14
- I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.
15
- -Kenneth
 
2
  This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
3
  and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
4
 
5
+ HOW TO USE:<br>
6
+ 1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)<br>
7
+ 2)Type in the name of the column where the quotes are located<br>
8
+ 3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit, e.g. Nike's "Just Do It", add a * in front of the tag, e.g. tag1, *Just Do It, tag3, etc. <br>
9
+ This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
10
+ Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
 
11
 
12
+ READING THE OUTPUT(in the right-side column): <br>
13
+ First table: All the responses are collated and displayed here, together with the original quotes.<br>
14
+ Second table: Displays all the tags used and the number of occurrences of each tag. <br>
15
+ You may also retrieve both tables in a single Excel file via the link below them.<br>
16
+ Sometimes a quote might be tagged with a tag that you didn't give; in such cases the tag will be prefixed with ! in the second table.
17
+
18
+ DISCLAIMER:<br>
19
  Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
20
+ I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.<br>
21
+ -Kenneth Lee, Perceptech.AI