AmirMoris committed on
Commit
71cb3e6
·
verified ·
1 Parent(s): 0e4baf9

Delete Textual Dataset Generation

Browse files
Textual Dataset Generation/.idea/.gitignore DELETED
@@ -1,8 +0,0 @@
1
- # Default ignored files
2
- /shelf/
3
- /workspace.xml
4
- # Editor-based HTTP Client requests
5
- /httpRequests/
6
- # Datasource local storage ignored files
7
- /dataSources/
8
- /dataSources.local.xml
 
 
 
 
 
 
 
 
 
Textual Dataset Generation/.idea/Textual Dataset Generation.iml DELETED
@@ -1,9 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <module type="JAVA_MODULE" version="4">
3
- <component name="NewModuleRootManager" inherit-compiler-output="true">
4
- <exclude-output />
5
- <content url="file://$MODULE_DIR$" />
6
- <orderEntry type="inheritedJdk" />
7
- <orderEntry type="sourceFolder" forTests="false" />
8
- </component>
9
- </module>
 
 
 
 
 
 
 
 
 
 
Textual Dataset Generation/.idea/misc.xml DELETED
@@ -1,6 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="11" project-jdk-type="JavaSDK">
4
- <output url="file://$PROJECT_DIR$/out" />
5
- </component>
6
- </project>
 
 
 
 
 
 
 
Textual Dataset Generation/.idea/modules.xml DELETED
@@ -1,8 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="ProjectModuleManager">
4
- <modules>
5
- <module fileurl="file://$PROJECT_DIR$/.idea/Textual Dataset Generation.iml" filepath="$PROJECT_DIR$/.idea/Textual Dataset Generation.iml" />
6
- </modules>
7
- </component>
8
- </project>
 
 
 
 
 
 
 
 
 
Textual Dataset Generation/.idea/workspace.xml DELETED
@@ -1,83 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="ChangeListManager">
4
- <list default="true" id="788fdcfa-b47d-4627-847f-ec90b9a0d9dd" name="Changes" comment="" />
5
- <option name="SHOW_DIALOG" value="false" />
6
- <option name="HIGHLIGHT_CONFLICTS" value="true" />
7
- <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
8
- <option name="LAST_RESOLUTION" value="IGNORE" />
9
- </component>
10
- <component name="ProjectColorInfo"><![CDATA[{
11
- "associatedIndex": 5
12
- }]]></component>
13
- <component name="ProjectId" id="2hyRbehAgzDFNKMNHP7EtzBH3c2" />
14
- <component name="ProjectViewState">
15
- <option name="hideEmptyMiddlePackages" value="true" />
16
- <option name="showLibraryContents" value="true" />
17
- </component>
18
- <component name="PropertiesComponent"><![CDATA[{
19
- "keyToString": {
20
- "Python.web scrap.executor": "Run",
21
- "RunOnceActivity.OpenProjectViewOnStart": "true",
22
- "RunOnceActivity.ShowReadmeOnStart": "true",
23
- "kotlin-language-version-configured": "true",
24
- "last_opened_file_path": "C:/Users/Amir/Desktop/Textual Dataset Generation",
25
- "nodejs_package_manager_path": "npm",
26
- "project.structure.last.edited": "SDKs",
27
- "project.structure.proportion": "0.15",
28
- "project.structure.side.proportion": "0.22758621",
29
- "vue.rearranger.settings.migration": "true"
30
- }
31
- }]]></component>
32
- <component name="RunManager">
33
- <configuration name="web scrap" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
34
- <module name="Textual Dataset Generation" />
35
- <option name="ENV_FILES" value="" />
36
- <option name="INTERPRETER_OPTIONS" value="" />
37
- <option name="PARENT_ENVS" value="true" />
38
- <envs>
39
- <env name="PYTHONUNBUFFERED" value="1" />
40
- </envs>
41
- <option name="SDK_HOME" value="" />
42
- <option name="SDK_NAME" value="Python 3.11 (base)" />
43
- <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
44
- <option name="IS_MODULE_SDK" value="false" />
45
- <option name="ADD_CONTENT_ROOTS" value="true" />
46
- <option name="ADD_SOURCE_ROOTS" value="true" />
47
- <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
48
- <option name="SCRIPT_NAME" value="$PROJECT_DIR$/web scrap.py" />
49
- <option name="PARAMETERS" value="" />
50
- <option name="SHOW_COMMAND_LINE" value="false" />
51
- <option name="EMULATE_TERMINAL" value="false" />
52
- <option name="MODULE_MODE" value="false" />
53
- <option name="REDIRECT_INPUT" value="false" />
54
- <option name="INPUT_FILE" value="" />
55
- <method v="2" />
56
- </configuration>
57
- </component>
58
- <component name="SharedIndexes">
59
- <attachedChunks>
60
- <set>
61
- <option value="jdk-11.0.21-corretto-11.0.21-3183f394aec4-d55de845" />
62
- </set>
63
- </attachedChunks>
64
- </component>
65
- <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
66
- <component name="TaskManager">
67
- <task active="true" id="Default" summary="Default task">
68
- <changelist id="788fdcfa-b47d-4627-847f-ec90b9a0d9dd" name="Changes" comment="" />
69
- <created>1718564403600</created>
70
- <option name="number" value="Default" />
71
- <option name="presentableId" value="Default" />
72
- <updated>1718564403600</updated>
73
- <workItem from="1718564404645" duration="101000" />
74
- </task>
75
- <servers />
76
- </component>
77
- <component name="TypeScriptGeneratedFilesManager">
78
- <option name="version" value="3" />
79
- </component>
80
- <component name="com.intellij.coverage.CoverageDataManagerImpl">
81
- <SUITE FILE_PATH="coverage/Textual_Dataset_Generation$web_scrap.coverage" NAME="web scrap Coverage Results" MODIFIED="1718564477196" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
82
- </component>
83
- </project>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Textual Dataset Generation/Dataset/input_dataset.csv DELETED
@@ -1,161 +0,0 @@
1
- category,caption
2
- shoes,a person sitting on top of a wooden bench
3
- shoes,unpaired red Nike sneaker
4
- shoes,brown Nike sneaker on yellow textile
5
- shoes,unpaired maroon plimsoll on top of yellow textile
6
- shoes,a person with their feet up in the air
7
- shoes,white and blue nike air force 1 high
8
- shoes,pair of white-and-orange athletic shoes on white box
9
- shoes,green and black nike athletic shoe
10
- shoes,"Various colorful sports shoes laid on sand beach background, studio shot, flat lay."
11
- shoes,unpaired OFF WHITE X Nike Air Force 1 low-top sneaker
12
- shoes,white and red nike athletic shoe
13
- shoes,person wearing white Nike running shoes standing on black concrete path
14
- shoes,unpaired gray Nike running shoe
15
- shoes,pair of blue-and-pink floral almond-toe pumps
16
- shoes,pair of gray running shoes
17
- shoes,Close up of a young skater girl's feet and skateboard
18
- shoes,white black and red nike air max 90
19
- shoes,a pair of white and brown shoes on a table
20
- shoes,unpaired red Air Jordan 12
21
- sneakers,a man sitting on a brick wall next to a skateboard
22
- sneakers,a person wearing a hat
23
- sneakers,black white and red nike high top sneaker
24
- sneakers,two people sitting on the ground with their legs crossed
25
- sneakers,pair of black-white-and-red Air Jordan 1 shoes
26
- sneakers,Nike shoe lot
27
- sneakers,white nike air force 1 low
28
- sneakers,person wearing white nike sneakers
29
- sneakers,a man holding a pair of purple shoes
30
- sneakers,close-up photography of person wears brown-and-white Nike Air Max
31
- sneakers,person holding white and red heart print box
32
- sneakers,pair of Carhartt x Nike Air Force 1 shoes
33
- sneakers,Legs and sneakers of teenage boys and girls standing on the sidewalk
34
- sneakers,man sitting on the ledge of a building wearing Air Jordan 1 low-top shoes
35
- sneakers,blue white and red neon light
36
- sneakers,person wearing black pants and blue and white nike sneakers
37
- heels,a person standing on top of a wooden stool
38
- heels,woman in pink patent leather stilettos
39
- heels,woman in black leather heeled shoes
40
- heels,a pair of shoes sitting on top of a couch
41
- heels,red and white plastic frame on white table
42
- heels,pair of women's brown pointed-toe pumps on board
43
- heels,woman wearing brown leather heeled sandals walking on staircase
44
- heels,a pair of woman's legs wearing black gloves and red high heels
45
- heels,a pair of purple high heeled shoes on a woman's leg
46
- heels,person in black pants and red shoes
47
- heels,womens white and silver peep toe pumps
48
- heels,a woman wearing red high heels standing on a wooden floor
49
- heels,black leather boot on white surface
50
- heels,gray leather peep toe sandals
51
- heels,brown leather boots on white table
52
- heels,a woman's legs wearing red high heels
53
- heels,black leather peep toe heeled shoes
54
- heels,women's seven assorted-color footwear on surface
55
- heels,womens brown leather peep toe heeled shoes
56
- watches,a man is holding onto a railing in a room
57
- watches,blue and silver analog watch at 10 00
58
- watches,gold and white analog watch
59
- watches,round silver-colored analog watch with black strap
60
- watches,Close up of man putting watch on his hand.
61
- watches,brown and white analog watch at 10 10
62
- watches,round silver-colored watch on rack during sunset
63
- watches,silver and black chronograph watch
64
- watches,a person holding a cup of tea in their hand
65
- watches,round gray analog watch with brown band
66
- watches,watch at 10:34
67
- watches,silver and white round analog watch
68
- watches,Man in black suit wear new watches. Luxury style
69
- watches,person wearing silver link bracelet round analog watch
70
- watches,round silver-colored Nomos watch at 10:10
71
- watches,silver link bracelet round chronograph watch
72
- watches,a woman sitting at a table with a cell phone in her hand
73
- watches,black and silver round analog watch
74
- watches,watch at 8:45
75
- watches,blue and gold analog watch
76
- pants,three pairs of jeans are lined up on a white surface
77
- pants,person in pink pants and white shoes
78
- pants,blue denim jeans on white textile
79
- pants,person wears blue jeans
80
- pants,a pair of black leather boots sitting on top of a pile of blue jeans
81
- pants,man wearing brown fitted jeans and sneakers standing on road at daytime
82
- pants,three assorted-color denim bottoms
83
- pants,blue denim jeans on brown clothes hanger
84
- pants,a man walking across a bridge carrying a suitcase
85
- pants,man in white t-shirt and black pants standing on white floor
86
- pants,woman in orange pants
87
- pants,selective focus photography of hanged denim jeans
88
- pants,a person sitting on a bench with their legs crossed
89
- pants,woman in blue denim jeans and white sneakers
90
- pants,blue denim jeans on black surface
91
- pants,a pair of hands holding a pair of jeans
92
- pants,closeup photo of person hiding his right hand in his pocket
93
- pants,woman in white tank top and blue denim jeans standing on beach during daytime
94
- pants,woman in gray tank top and gray pants
95
- clothing,a piece of cloth sitting on top of a table
96
- clothing,"gray cardigan, blue jeans, and pair of brown chunky heeled shoes"
97
- clothing,green clothes hanger
98
- clothing,closeup of hanged shirts on rack
99
- clothing,a woman holding a pair of jeans
100
- clothing,women's white long sleeve shirt
101
- clothing,pair of white low-top sneakers
102
- clothing,hanged jeans lot
103
- clothing,a close up of a pair of blue jeans
104
- clothing,gray dress shirt hang on brown wooden rack in front of window with white curtain
105
- clothing,white and red nike air force 1 high
106
- clothing,woman standing selecting clothes
107
- clothing,a stack of folded clothes on a table next to a lamp
108
- clothing,woman waering black blouse and hat
109
- clothing,woman in red long sleeve dress
110
- clothing,assorted color folded shirts on wooden panel
111
- clothing,a pile of folded clothes sitting on top of a brown leather chair
112
- clothing,selective focus photography of hanged three gray tee shirts
113
- clothing,white and blue cat-printed crew-neck T-shirt
114
- clothing,assorted-color shirt lot hang on rack
115
- dress,a woman sitting on a bed in a room
116
- dress,woman in red sleeveless dress standing on gray concrete floor during daytime
117
- dress,green sleeveless dress hanged on white wall
118
- dress,woman in seashore
119
- dress,a woman standing on a beach holding a banana leaf
120
- dress,woman walking on seaside while holding woven bag
121
- dress,woman in yellow and white floral dress
122
- dress,woman wearing black turtleneck long-sleeved dress
123
- dress,a woman kneeling on a white background posing for a picture
124
- dress,woman walking down stair under clear blue sky during daytime
125
- dress,black leather spaghetti strap dress
126
- dress,woman standing in front of white wall
127
- dress,woman in beige floral sleeveless dress
128
- dress,woman wearing pink dress while standing near black metal rail at daytime
129
- dress,a rack of clothes hanging on a wall
130
- dress,woman in brown long sleeve dress standing beside gray wall
131
- dress,woman in green long sleeve dress standing on brown field during daytime
132
- dress,selective focus photo of smiling woman wearing black dress standing on concrete pavement
133
- shirt,a close up of an umbrella with a metal handle
134
- shirt,white crew neck t-shirt
135
- shirt,man in white dress shirt wearing black sunglasses
136
- shirt,gray button up long sleeve shirt
137
- shirt,"A young businessman standing in corridor outside office, looking at camera."
138
- shirt,black crew neck t-shirt
139
- shirt,blue and white checkered dress shirt
140
- shirt,man in black and white plaid button up shirt wearing black sunglasses
141
- shirt,white button up shirt on clothes hanger
142
- shirt,white crew neck long sleeve shirt
143
- shirt,A fathers day greeting card concept. Flat lay. Copy space.
144
- shirt,man in white dress shirt with black and white polka dots necktie
145
- shirt,woman wearing orange crew-neck sweatshirt standing while putting right hand on her head
146
- shirt,a person holding a white shirt on a hanger
147
- shirt,man in blue dress shirt wearing black sunglasses
148
- shirt,man in white crew neck t-shirt standing on green grass field during daytime
149
- t-shirt,a young man wearing a black shirt and a chain around his neck
150
- t-shirt,man wearing white crew-neck t-shirts
151
- t-shirt,a woman sitting on the floor with her legs crossed
152
- t-shirt,man in white crew neck t-shirt
153
- t-shirt,smiling woman in black and white print t-shirt
154
- t-shirt,woman in black crew neck t-shirt and blue denim jeans
155
- t-shirt,photo of blue crew-neck tops
156
- t-shirt,a woman in a white shirt and a pink hat
157
- t-shirt,woman in black crew neck t-shirt standing near painting
158
- t-shirt,men's white crew-neck t-shirt
159
- t-shirt,man wearing white crew-neck t-shirt
160
- t-shirt,man in pink crew neck t-shirt wearing black sunglasses
161
- t-shirt,man in black crew neck t-shirt standing during daytime
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Textual Dataset Generation/Dataset/textual_dataset.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
Textual Dataset Generation/main.py DELETED
@@ -1,256 +0,0 @@
1
- import json
2
- import pandas as pd
3
-
4
# Colour edits: (colour word, instruction template).
# The colour word is inserted in front of the matched product word; "*" in
# the template is replaced with that product word.
color_changes = [
    ("black", "Change * color to black"),
    ("white", "Change * color to white"),
    ("red", "Change * color to red"),
    ("blue", "Change * color to blue"),
    ("orange", "Change * color to orange"),
    ("purple", "Change * color to purple"),
]

# Scene edits: (phrase appended to the caption, instruction template).
# "*" in the template is replaced with the matched product word.
scene_changes = [
    ("by the beach", "put * by the beach"),
    ("in the desert", "put * in the desert"),
    # The "*" placeholder belongs in the instruction; inside the appended
    # phrase it would leak a literal "*" into the generated caption.
    ("in a festival", "put * in a festival"),
    ("on a table", "put * on a table"),
    ("in a forest", "put * in a forest"),
    ("on a roof", "put * on a roof"),
    ("in the air", "make * in the air"),
    ("in hands", "put * in someone's hands"),
    ("covered with water", "put * in water"),
]

# Season / time-of-day edits: (phrase appended to the caption, instruction).
# These instructions are used verbatim (no "*" substitution).
season_changes = [
    ("in snow", "add snow effects"),
    ("in spring", "add spring effects"),
    ("in summer", "add summer effects"),
    ("in autumn", "add autumn effects"),
    ("during daytime", "make the scene in daytime"),
    ("during nighttime", "make the period in nighttime"),
]

# Background edits: (phrase appended to the caption, instruction).
background_changes = [
    ("in front of a mountain", "add mountain in the background"),
    ("in front of the pyramids", "add the pyramids in the background"),
]

# Singular product words searched for (as substrings) in captions.
# Each word must appear only once: a repeated entry would produce duplicate
# rows and be counted twice in the statistics total.
PRODUCTS = [
    "t-shirt",
    "shirt",
    "dress",
    "pants",
    "shoe",
    "watch",
    "vase",
    "sneaker",
    "headphone",
    "bottle",
    "perfume",
    "cup",
    "camera",
    "phone",
    "mobile",
    "bag",
    "earpod",
    "earbud",
    "heel",
]
61
-
62
-
63
def find_words(lst, input):
    """Return the words from *lst* that occur in *input*.

    Each word is located with a plain substring search (first occurrence
    only). When the character immediately after the match is ``"s"``, the
    returned word carries that trailing ``"s"`` so callers can substitute
    the plural form exactly as it appears in the text.

    Args:
        lst: Candidate words to look for.
        input: Text to search. The name shadows the builtin but is kept so
            existing keyword callers keep working.

    Returns:
        A list of matched words in the order of *lst*; one entry per
        matching element of *lst*.
    """
    found_words = []
    for word in lst:
        start = input.find(word)
        if start == -1:
            continue

        end = start + len(word)
        # Look at the character directly after the match. Slicing (rather
        # than indexing) yields "" at end-of-text instead of raising.
        if input[end:end + 1] == "s":
            word += "s"

        found_words.append(word)

    return found_words
74
-
75
-
76
def perform_color_changes(input, mixed_color=False):
    """Build colour-change rows for one caption.

    A single trailing ``"s"`` is dropped from the caption first. Colours
    from ``color_changes`` that appear surrounded by spaces are then removed
    so the inserted colour does not end up next to an existing one. For
    every product word found, one row per colour is produced with the colour
    inserted in front of each occurrence of that word.

    Args:
        input: Caption text (expected to be already lower-cased by the
            caller).
        mixed_color: When truthy, an existing colour is rewritten as
            ``"<color>-"`` instead of being removed.

    Returns:
        A list of dicts with the keys ``"caption"`` (the caption after the
        trailing-"s" strip), ``"edit"`` (the instruction) and ``"output"``
        (the edited caption).
    """
    if input.endswith("s"):
        input = input[:-1]

    original_input = input
    found_words = find_words(PRODUCTS, input)

    # Remove existing colours to avoid repeated consecutive colours.
    for color, _ in color_changes:
        # NOTE(review): in mixed mode the leading space is consumed as well
        # ("a red shoe" -> "ared-shoe"); confirm that this is intended.
        replacement = color + "-" if mixed_color else " "
        input = input.replace(" " + color + " ", replacement)

    # Collapse any run of whitespace into a single space.
    input = " ".join(input.split())

    return [
        {
            "caption": original_input,
            "edit": template.replace("*", word),
            "output": input.replace(word, color + " " + word),
        }
        for word in found_words
        for color, template in color_changes
    ]
101
-
102
-
103
def perform_scene_changes(input):
    """Build scene-change rows for one caption.

    Any scene phrase from ``scene_changes`` already present in the caption
    is removed, then every scene phrase is appended in turn. One row is
    produced per (product word found, scene) pair.

    Args:
        input: Caption text.

    Returns:
        A list of dicts with the keys ``"caption"`` (the unmodified input),
        ``"edit"`` (the instruction with "*" replaced by the product word)
        and ``"output"`` (the caption with the scene phrase appended).
    """
    original_input = input
    for phrase, _ in scene_changes:
        input = input.replace(phrase, "")

    result = []
    for word in find_words(PRODUCTS, input):
        for phrase, template in scene_changes:
            # split/join collapses the double spaces left behind when a
            # phrase was cut out of the middle of the caption.
            output = " ".join((input + " " + phrase).split())
            result.append(
                {
                    "caption": original_input,
                    "edit": template.replace("*", word),
                    "output": output,
                }
            )

    return result
126
-
127
-
128
def perform_affects_changes(input):
    """Build season / time-of-day rows for one caption.

    Season phrases already present in the caption (e.g. "during daytime")
    are removed before each phrase from ``season_changes`` is appended, so
    the output never contains the same or conflicting phrases twice.

    Args:
        input: Caption text.

    Returns:
        An empty list when the caption mentions no known product; otherwise
        one dict per season with the keys ``"caption"`` (the unmodified
        input), ``"edit"`` (the instruction) and ``"output"`` (the caption
        with the season phrase appended).
    """
    original_input = input
    # Strip the phrases this function is about to append.
    for phrase, _ in season_changes:
        input = input.replace(phrase, "")

    # NOTE(review): rows are emitted once per caption, no matter how many
    # products matched - confirm that this is the intended behaviour.
    if not find_words(PRODUCTS, input):
        return []

    return [
        {
            "caption": original_input,
            "edit": instruction,
            # split/join collapses the whitespace left by removed phrases.
            "output": " ".join((input + " " + phrase).split()),
        }
        for phrase, instruction in season_changes
    ]
151
-
152
-
153
def perform_background_changes(input):
    """Build background-change rows for one caption.

    Background phrases from ``background_changes`` already present in the
    caption are removed, then each background phrase is appended in turn.

    Args:
        input: Caption text.

    Returns:
        An empty list when the caption mentions no known product; otherwise
        one dict per background with the keys ``"caption"`` (the unmodified
        input), ``"edit"`` (the instruction) and ``"output"`` (the caption
        with the background phrase appended).
    """
    original_input = input
    for phrase, _ in background_changes:
        input = input.replace(phrase, "")

    # NOTE(review): rows are emitted once per caption, no matter how many
    # products matched - confirm that this is the intended behaviour.
    if not find_words(PRODUCTS, input):
        return []

    return [
        {
            "caption": original_input,
            "edit": instruction,
            # split/join collapses the whitespace left by removed phrases.
            "output": " ".join((input + " " + phrase).split()),
        }
        for phrase, instruction in background_changes
    ]
176
-
177
-
178
def get_rows(input):  # return: list of dictionary
    """Generate every edit row for a single caption.

    The caption is lower-cased, hyphens become spaces and the brand token
    "nike " is removed. The normalised text is then passed to the colour,
    scene, season and background generators, and their rows are returned
    concatenated in that order.
    """
    text = input.lower().replace("-", " ")
    # remove brands
    text = text.replace("nike ", "")

    return [
        *perform_color_changes(text, False),
        *perform_scene_changes(text),
        *perform_affects_changes(text),
        *perform_background_changes(text),
    ]
201
-
202
-
203
def statistics(strs):
    """Count how many strings mention each product.

    Matching is a case-insensitive substring test against every entry of
    ``PRODUCTS``.

    Returns:
        A ``(stat, total)`` tuple: ``stat`` maps product -> number of
        matching strings, ordered by descending count; ``total`` is the sum
        of the counts taken over every entry of ``PRODUCTS``.
    """
    hits_by_product = {}
    grand_total = 0

    for name in PRODUCTS:
        needle = name.lower()
        hits = sum(1 for text in strs if needle in text.lower())
        hits_by_product[name] = hits
        grand_total += hits

    ranked = sorted(hits_by_product.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked), grand_total
214
-
215
-
216
def load_dataset(path):
    """Read the CSV file at *path* and return it as a pandas DataFrame."""
    return pd.read_csv(path)
220
-
221
-
222
def save_dataset(data, file_path):
    """Write *data* to *file_path* as JSON Lines.

    Each record is serialised with ``json.dumps`` onto its own line. Lines
    are separated by ``"\\n"`` and no newline follows the last record.

    Args:
        data: Iterable of JSON-serialisable records.
        file_path: Destination path; an existing file is overwritten.
    """
    # An explicit encoding keeps the output independent of the platform
    # default.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write("\n".join(json.dumps(record) for record in data))
227
-
228
-
229
def insert_row(df, new_data: list):
    """Return a copy of *df* with the rows in *new_data* appended.

    Args:
        df: Existing DataFrame; it is not modified.
        new_data: Row data accepted by ``pd.DataFrame`` and interpreted with
            the columns of *df*.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    new_rows = pd.DataFrame(new_data, columns=df.columns)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df, new_rows], ignore_index=True)
235
-
236
-
237
def main():
    """Generate the textual edit dataset from the input captions.

    Reads ``Dataset/input_dataset.csv``, expands every value of its
    ``caption`` column into edit rows, prints per-product statistics and
    writes the rows to ``Dataset/textual_dataset.jsonl``. Both paths are
    relative to the current working directory.
    """
    captions = load_dataset("Dataset/input_dataset.csv")["caption"]

    generated_rows = []
    for caption in captions:
        # Extending with an empty list is a no-op, so no length guard is
        # needed.
        generated_rows.extend(get_rows(caption))

    counts, total = statistics([row["caption"] for row in generated_rows])
    counts_table = pd.DataFrame([counts]).transpose()

    print(f"TOTAL : {total}")
    print(counts_table)

    save_dataset(generated_rows, "Dataset/textual_dataset.jsonl")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Textual Dataset Generation/web scrap.py DELETED
@@ -1,66 +0,0 @@
1
- import pandas as pd
2
- import requests
3
- from bs4 import BeautifulSoup
4
- import time
5
-
6
-
7
def scrap(url: str, timeout: float = 10.0):
    """Fetch *url* and collect the alt texts of the images it lists.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A list of alt-text strings shorter than 100 characters, or ``None``
        when the request fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    result = []
    # NOTE(review): "WxXog" looks like a generated CSS class name; it will
    # presumably need updating when the page markup changes - confirm.
    for div in soup.find_all("div", class_="WxXog"):
        img_tag = div.find("img")
        if not img_tag:
            continue

        alt_text = img_tag.get("alt")
        if alt_text is not None and len(alt_text) < 100:
            result.append(alt_text)

    return result
28
-
29
-
30
def main():
    """Scrape image captions per category and save them as a CSV file.

    Each category's search page is fetched via ``scrap``; captions not seen
    before are recorded together with the category they were first found
    under. The result is written to ``input_dataset.csv`` in the current
    working directory.
    """
    categories = [
        "shoes",
        "sneakers",
        "heels",
        "watches",
        "pants",
        "clothing",
        "dress",
        "shirt",
        "t-shirt",
    ]
    base_url = "https://unsplash.com/s/photos/"

    rows = []
    seen_captions = set()

    for category in categories:
        url = f"{base_url}{category}"
        print(f"Scraping {url}...")

        captions = scrap(url)
        if captions is not None:
            for caption in captions:
                if caption in seen_captions:
                    continue
                rows.append({"category": category, "caption": caption})
                seen_captions.add(caption)

        time.sleep(1)  # Delay to avoid rate limiting

    # Save to CSV
    pd.DataFrame(rows).to_csv("input_dataset.csv", index=False)


if __name__ == "__main__":
    main()