Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- create_embeddings.py +51 -0
- create_embeddings2.py +58 -0
- extract_variables.py +36 -0
- paragraphs2.py +0 -0
- variables.py +159 -0
create_embeddings.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pinecone
|
| 2 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 3 |
+
from paragraphs2 import podcasts
|
| 4 |
+
import uuid
|
| 5 |
+
import os

# Credentials are taken from the environment so that no secret lives in the
# source tree. GOOGLE_API_KEY and PINECONE_API_KEY must both be set before
# this script is run; a missing variable raises KeyError immediately.
# NOTE(review): the keys that used to be hard-coded here were published with
# the source and should be revoked and rotated.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Initialize Pinecone instance
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Define the Pinecone index name (make sure it exists in your Pinecone dashboard)
index_name = "iocl2"
index = pc.Index(index_name)
|
| 18 |
+
|
| 19 |
+
def create_embedding(variable):
    """Embed one content entry and upsert it into the Pinecone index.

    Args:
        variable: Mapping read with ``.get`` for the keys ``"description"``
            (the text that is embedded), ``"url"`` (a string or a list of
            strings) and ``"tag"``.

    Returns:
        None. The outcome is reported with ``print``. Any exception raised
        while reading, embedding or upserting is caught and printed rather
        than re-raised, so a failing entry does not stop the caller.
    """
    try:
        content = variable.get("description")
        # Fall back to empty strings so that a missing url/tag is never sent
        # as a null metadata value (Pinecone metadata does not accept nulls).
        url = variable.get("url") or ""
        tag = variable.get("tag") or ""

        # A list of URLs is flattened into one comma-separated string because
        # it is stored in a single metadata field.
        updated_url = ",".join(url) if isinstance(url, list) else url

        embedding = google_embeddings.embed_query(content)
        index.upsert([
            {
                'id': str(uuid.uuid4()),  # fresh random id for every upsert
                'values': embedding,
                'metadata': {
                    'chunk': content,
                    "url": updated_url,
                    "tag": tag,
                },
            }
        ])
        print(f"inserted : {updated_url}")
    except Exception as e:
        # Best-effort ingestion: report the failure and return normally.
        print(f"error occured {e}")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Embed and upsert the single `podcasts` entry imported from paragraphs2.
create_embedding(podcasts)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
create_embeddings2.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pinecone
|
| 2 |
+
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 3 |
+
from variables import variables # Import the list of variable names
|
| 4 |
+
import uuid
|
| 5 |
+
|
| 6 |
+
# Initialize Google Embeddings
|
| 7 |
+
import os

# Credentials are taken from the environment so that no secret lives in the
# source tree. GOOGLE_API_KEY and PINECONE_API_KEY must both be set before
# this script is run; a missing variable raises KeyError immediately.
# NOTE(review): the keys that used to be hard-coded here were published with
# the source and should be revoked and rotated.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Initialize Pinecone instance
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Define the Pinecone index name (make sure it exists in your Pinecone dashboard)
index_name = "iocl2"
index = pc.Index(index_name)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def create_embedding(variable):
    """Embed a single entry's description and store it in the Pinecone index.

    Args:
        variable: Mapping read with ``.get``. ``"description"`` supplies the
            text to embed, ``"url"`` (string or list of strings, default
            ``""``) and ``"tag"`` (default ``""``) are stored as metadata.

    Returns:
        None. Success and failure are both reported with ``print``; every
        exception raised inside the function is caught and printed.
    """
    try:
        text = variable.get("description", None)
        raw_url = variable.get("url", "")
        label = variable.get("tag", "")

        # Several URLs are collapsed into one comma-separated string.
        if isinstance(raw_url, list):
            joined_url = ",".join(raw_url)
        else:
            joined_url = raw_url

        vector = google_embeddings.embed_query(text)
        record = {
            'id': str(uuid.uuid4()),  # random id, so re-runs add new records
            'values': vector,
            'metadata': {
                'chunk': text,
                "url": joined_url,
                "tag": label,
            },
        }
        index.upsert([record])
        print(f"Inserted the chunk: {joined_url}")
    except Exception as e:
        # Failures are reported and swallowed so the caller's loop continues.
        print(f"Error occurred: {e}")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Iterate over the variable names and create embeddings
|
| 52 |
+
# Load the data module once; every name listed in `variables` is then looked
# up as an attribute of it.
import paragraphs2

for variable_name in variables:
    try:
        variable_data = getattr(paragraphs2, variable_name)
    except AttributeError:
        # A listed name may not exist at module level in paragraphs2; skip it
        # instead of aborting the remaining entries.
        print(f"skipping {variable_name}: not found in paragraphs2")
        continue
    # Log the variable's name (not the module object) so progress is readable.
    print(f"trying to create embedding for {variable_name}")
    # Call the create_embedding function with the variable data
    create_embedding(variable_data)
|
extract_variables.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
|
| 3 |
+
def get_dict_variable_names(file_path):
    """Return the names bound to dict literals in a Python source file.

    The file is parsed with ``ast`` (it is never executed) and the whole tree
    is walked depth-first, so assignments nested inside functions or classes
    are reported as well as module-level ones.

    Args:
        file_path: Path of the Python source file to scan (read as UTF-8).

    Returns:
        A list of variable names, in source order, for every plain-name
        target of an assignment whose right-hand side is a ``{...}`` literal.
        Attribute or subscript targets and ``dict(...)`` calls are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        tree = ast.parse(source.read(), filename=file_path)

    found = []

    def walk(current):
        # Record the targets first, then descend, to keep source order.
        if isinstance(current, ast.Assign) and isinstance(current.value, ast.Dict):
            found.extend(
                target.id
                for target in current.targets
                if isinstance(target, ast.Name)
            )
        for child in ast.iter_child_nodes(current):
            walk(child)

    walk(tree)
    return found
|
| 21 |
+
|
| 22 |
+
def write_variables_to_file(variables, output_file):
    """Write the given names to a Python file as a ``variables`` list.

    Args:
        variables: Iterable of names; each is written as a single-quoted
            string on its own line.
        output_file: Path of the file to create or overwrite (UTF-8).
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write("variables = [\n")
        out.writelines(f" '{name}',\n" for name in variables)
        out.write("]\n")
|
| 28 |
+
|
| 29 |
+
# Example usage
|
| 30 |
+
# Script body: scan the input module for dict-literal assignments and
# regenerate the module that lists their names.
input_file_path, output_file_path = 'paragraphs2.py', 'variables.py'

dict_variables = get_dict_variable_names(input_file_path)
write_variables_to_file(dict_variables, output_file_path)

print(f"Initialized dictionary variables stored in {output_file_path}.")
|
paragraphs2.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
variables.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
variables = [
|
| 2 |
+
'about_iocl',
|
| 3 |
+
'awards',
|
| 4 |
+
'corporate_logo',
|
| 5 |
+
'vision_and_values',
|
| 6 |
+
'company_leaders',
|
| 7 |
+
'company_mascot',
|
| 8 |
+
'refining_overview',
|
| 9 |
+
'installed_refinery_capacities',
|
| 10 |
+
'bongaigaon_refinery',
|
| 11 |
+
'barauni_refinery',
|
| 12 |
+
'pradip_refinery',
|
| 13 |
+
'haldia_refinery',
|
| 14 |
+
'mathura_refinery',
|
| 15 |
+
'gujarat_refinery',
|
| 16 |
+
'panipat_refinery',
|
| 17 |
+
'guwahati_refinery',
|
| 18 |
+
'digboi_refinery',
|
| 19 |
+
'pipeline_overview',
|
| 20 |
+
'pipelines_under_implementation',
|
| 21 |
+
'crude_oil_pipelines',
|
| 22 |
+
'petroleum_pipelines',
|
| 23 |
+
'gas_pipelines',
|
| 24 |
+
'pipeline_safety_overview',
|
| 25 |
+
'pipeline_identification',
|
| 26 |
+
'pipeline_emergency_contact',
|
| 27 |
+
'pipeline_leak_response',
|
| 28 |
+
'pipeline_protection',
|
| 29 |
+
'pipeline_crossing_portal',
|
| 30 |
+
'rd_overiview_and_achievements',
|
| 31 |
+
'lubricants',
|
| 32 |
+
'refining_tech_innovations',
|
| 33 |
+
'petrochemicals_polymer_pipeline_maintenance',
|
| 34 |
+
'fuel_additives_and_additional_energy',
|
| 35 |
+
'bioenergy_nano_tech',
|
| 36 |
+
'marketing',
|
| 37 |
+
'pump_locator',
|
| 38 |
+
'petrochemical_overview',
|
| 39 |
+
'petrochemical_strategic_businessUnit_and_achievements',
|
| 40 |
+
'petrochemical_plants',
|
| 41 |
+
'petrochemical_contact',
|
| 42 |
+
'natural_gas',
|
| 43 |
+
'natural_contact',
|
| 44 |
+
'natural_gas_brochure',
|
| 45 |
+
'cgd',
|
| 46 |
+
'eAndp',
|
| 47 |
+
'eAndpContact',
|
| 48 |
+
'iocl_explosives_business_overview',
|
| 49 |
+
'indogel_explosive_brand',
|
| 50 |
+
'explosive_safety_effeciency',
|
| 51 |
+
'current_explosive_plants_and_future_plans',
|
| 52 |
+
'explosive_contact',
|
| 53 |
+
'cryogenics_overview',
|
| 54 |
+
'cryo_product_range',
|
| 55 |
+
'cryo_certifications',
|
| 56 |
+
'cryocan',
|
| 57 |
+
'cryovessel',
|
| 58 |
+
'special_cryo_project',
|
| 59 |
+
'pressure_vessels_cryo',
|
| 60 |
+
'cryo_aviation',
|
| 61 |
+
'iocl_offices',
|
| 62 |
+
'indian_subsidries',
|
| 63 |
+
'foreign_subsidries',
|
| 64 |
+
'joint_ventures',
|
| 65 |
+
'sri_lanka',
|
| 66 |
+
'mauritus',
|
| 67 |
+
'middle_east',
|
| 68 |
+
'chennai',
|
| 69 |
+
'refinery_upcoming_projects',
|
| 70 |
+
'pipeline_projects',
|
| 71 |
+
'cgd_projects',
|
| 72 |
+
'marketin_projects',
|
| 73 |
+
'podcasts',
|
| 74 |
+
'iocl_revenue',
|
| 75 |
+
'xp95',
|
| 76 |
+
'xtraGreen',
|
| 77 |
+
'cng',
|
| 78 |
+
'cng_price',
|
| 79 |
+
'gasoline',
|
| 80 |
+
'high_speed_diesel',
|
| 81 |
+
'xp100',
|
| 82 |
+
'swagat',
|
| 83 |
+
'autogas',
|
| 84 |
+
'fuel_testing',
|
| 85 |
+
'xtrapower_program',
|
| 86 |
+
'xtrarewards',
|
| 87 |
+
'servo_lubricant',
|
| 88 |
+
'lubes_contact',
|
| 89 |
+
'automtive_lubricating_oil',
|
| 90 |
+
'png',
|
| 91 |
+
'png_urls',
|
| 92 |
+
'indane',
|
| 93 |
+
'indane_price',
|
| 94 |
+
'chotu_gas',
|
| 95 |
+
'composite_cylinder',
|
| 96 |
+
'munna_cylinder',
|
| 97 |
+
'kersone',
|
| 98 |
+
'non_fuel_products',
|
| 99 |
+
'surya_nutan',
|
| 100 |
+
'commercial_indane',
|
| 101 |
+
'industrial_png',
|
| 102 |
+
'bulk_fuel',
|
| 103 |
+
'fuel_call',
|
| 104 |
+
'aviation_fuel',
|
| 105 |
+
'aviation_contact',
|
| 106 |
+
'avgas',
|
| 107 |
+
'marine_oils',
|
| 108 |
+
'bitumen',
|
| 109 |
+
'agri_spray_oils',
|
| 110 |
+
'industrial_greases',
|
| 111 |
+
'industrial_lubes',
|
| 112 |
+
'industrial_speciality_oil',
|
| 113 |
+
'metal_working_oil',
|
| 114 |
+
'railroad_grease',
|
| 115 |
+
'non_fuel_alliances',
|
| 116 |
+
'non_pds_keosene',
|
| 117 |
+
'glycols',
|
| 118 |
+
'lab',
|
| 119 |
+
'polymers',
|
| 120 |
+
'pta',
|
| 121 |
+
'special_products',
|
| 122 |
+
'benzene',
|
| 123 |
+
'cbfs',
|
| 124 |
+
'food_grade_hexane',
|
| 125 |
+
'jute_batching_oil',
|
| 126 |
+
'paraffin',
|
| 127 |
+
'propylene',
|
| 128 |
+
'tech_for_licensing',
|
| 129 |
+
'cgm_contact',
|
| 130 |
+
'csr',
|
| 131 |
+
'iocl_foundation',
|
| 132 |
+
'sustainability',
|
| 133 |
+
'iocl_suppliers',
|
| 134 |
+
'iocl_sports',
|
| 135 |
+
'sports_values',
|
| 136 |
+
'sports_legacy',
|
| 137 |
+
'sport_scholarship',
|
| 138 |
+
'sport_equality',
|
| 139 |
+
'family_sport',
|
| 140 |
+
'news',
|
| 141 |
+
'petrol_diesel_price',
|
| 142 |
+
'india_energy_week',
|
| 143 |
+
'iocl_iim',
|
| 144 |
+
'iocl_careers',
|
| 145 |
+
'net_zero',
|
| 146 |
+
'env_management',
|
| 147 |
+
'pollution_control',
|
| 148 |
+
'air_poll',
|
| 149 |
+
'solid_waste',
|
| 150 |
+
'oil_spill',
|
| 151 |
+
'noise_poll',
|
| 152 |
+
'green_belt',
|
| 153 |
+
'energy_effecient',
|
| 154 |
+
'satat_scheme',
|
| 155 |
+
'safety',
|
| 156 |
+
'occupational_health',
|
| 157 |
+
'green_fuel',
|
| 158 |
+
'contact_iocl',
|
| 159 |
+
]
|