43 lines
1.2 KiB
Python
43 lines
1.2 KiB
Python
|
import numpy as np
|
||
|
import matplotlib.pyplot as plt
|
||
|
import pandas as pd
|
||
|
|
||
|
# Load the dataset.
dataset = pd.read_csv("Data.csv")
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column, i.e. exactly the column excluded from X above.
# Addressing it as -1 instead of the hard-coded position 3 keeps X and Y
# complementary even when the CSV does not have exactly four columns
# (for a four-column file the result is identical).
Y = dataset.iloc[:, -1].values
|
||
|
|
||
|
# Handle missing data.
from sklearn.impute import SimpleImputer

# NaN entries in columns 1 and 2 of X are replaced by the mean of their column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
|
||
|
|
||
|
# Encode categorical data.
from sklearn.preprocessing import LabelEncoder

# Map the labels in column 0 of X to integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 and pass the remaining columns through unchanged.
# OneHotEncoder emits a sparse matrix, and ColumnTransformer returns a sparse
# result whenever the overall density falls below `sparse_threshold` (0.3 by
# default) -- which np.array(..., dtype=float) below cannot convert.
# sparse_threshold=0 forces a dense array no matter how many distinct
# categories column 0 holds.
ct = ColumnTransformer(
    transformers=[('col-0', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X), dtype=float)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
|
||
|
|
||
|
# Split the dataset into a training set and a test set.
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the samples for testing; random_state=0
# makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
|
||
|
|
||
|
# Feature scaling.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then apply that same fitted
# transformation to both the training and the test set.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

print(X_train)
|