Ashar086 committed on
Commit
e1dd5ab
·
verified ·
1 Parent(s): 15fd8dd

Create modules/data_ingestion.py

Browse files
Files changed (1) hide show
  1. modules/data_ingestion.py +64 -0
modules/data_ingestion.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from llama_index import GPTVectorStoreIndex, Document
3
+ from typing import Union, List
4
+ import json
5
+
6
class DataIngestionModule:
    """Load tabular files, clean them and index their rows with LlamaIndex.

    The module reads CSV, XLSX and JSON inputs into pandas DataFrames,
    applies a small cleaning pass, and can either build a vector index from
    the cleaned rows or write the cleaned data back out to disk.
    """

    def __init__(self):
        # Lower-case file extension (without the dot) -> pandas reader.
        self.supported_formats = {
            'csv': pd.read_csv,
            'xlsx': pd.read_excel,
            'json': pd.read_json,
        }

    def load_data(self, file) -> pd.DataFrame:
        """Load data from various file formats.

        Args:
            file: Either a path string, or an object exposing a ``name``
                attribute (an uploaded file, an open file handle, a
                ``pathlib.Path``). The object itself is handed to the
                matching pandas reader.

        Returns:
            The DataFrame produced by the reader registered for the file's
            extension.

        Raises:
            ValueError: If the name has no extension or the extension is not
                one of ``self.supported_formats``.
        """
        # A plain path string has no ``.name``; use the string itself.
        file_name = file if isinstance(file, str) else file.name

        # Only text after the last dot counts as an extension; a name without
        # any dot has no extension at all (it is not its own extension).
        _, dot, suffix = str(file_name).rpartition('.')
        file_extension = suffix.lower() if dot else ''

        reader = self.supported_formats.get(file_extension)
        if reader is None:
            raise ValueError(f"Unsupported file format: {file_extension}")

        return reader(file)

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe.

        Steps, in order: drop duplicate rows, replace missing values in every
        column with an empty string, then cast every object-dtype column to
        ``str``. The input frame is not modified; a new frame is returned.

        Args:
            df: The raw DataFrame.

        Returns:
            The cleaned DataFrame.
        """
        # drop_duplicates / fillna both return new objects, so the column
        # assignments below never touch the caller's frame.
        cleaned = df.drop_duplicates().fillna('')

        # NOTE(review): columns are selected *after* fillna, so any column
        # that pandas turned into object dtype by inserting '' is stringified
        # as well -- presumably intended; confirm against callers.
        text_columns = cleaned.select_dtypes(include=['object']).columns
        for col in text_columns:
            cleaned[col] = cleaned[col].astype(str)

        return cleaned

    def index_data(self, df: pd.DataFrame) -> "GPTVectorStoreIndex":
        """Create a LlamaIndex index from the dataframe.

        The frame is preprocessed first; each remaining row becomes one
        document whose text is ``"<col>: <value>"`` pairs joined by spaces.

        Args:
            df: The raw DataFrame to index.

        Returns:
            The index built by ``GPTVectorStoreIndex.from_documents``.
        """
        processed_df = self.preprocess_data(df)

        # ``text`` is passed by keyword: newer llama_index releases define
        # Document as a keyword-only model, older ones accept the keyword too.
        documents = [
            Document(text=" ".join(f"{col}: {val}" for col, val in row.items()))
            for _, row in processed_df.iterrows()
        ]

        return GPTVectorStoreIndex.from_documents(documents)

    def export_processed_data(self, df: pd.DataFrame, format: str, path: str) -> None:
        """Export processed data to the specified format.

        Args:
            df: The raw DataFrame; it is preprocessed before being written.
            format: One of ``'csv'``, ``'json'`` or ``'xlsx'``
                (case-insensitive).
            path: Destination passed to the pandas writer.

        Raises:
            ValueError: If ``format`` is not a supported export format. This
                is checked before any preprocessing work is done.
        """
        export_format = format.lower()
        if export_format not in ('csv', 'json', 'xlsx'):
            raise ValueError(f"Unsupported export format: {format}")

        processed_df = self.preprocess_data(df)

        if export_format == 'csv':
            processed_df.to_csv(path, index=False)
        elif export_format == 'json':
            processed_df.to_json(path, orient='records')
        else:
            processed_df.to_excel(path, index=False)