Import bibliotek do analizy danych

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

Załadowanie zbioru danych

In [3]:
# Location of the CSV file, given relative to the notebook's working directory.
file_path = 'NY-House-Dataset.csv'
# Parse the CSV into the DataFrame that the later cells read from.
ny_housing_data = pd.read_csv(filepath_or_buffer=file_path)

Wyświetlenie pierwszych wierszy zbioru danych

In [None]:
# Show the first rows as the cell's output value so Jupyter renders the
# DataFrame as a table; print() would only emit its plain-text form.
ny_housing_data.head()

Sprawdzenie datasetu pod kątem brakujących wartości

In [None]:
# Count the missing entries in each column (isna is the pandas alias of isnull).
print(ny_housing_data.isna().sum())

Podstawowa wizualizacja danych

In [None]:
# Price distribution as a boxplot, which makes the outliers easy to spot.
# set_theme is seaborn's preferred spelling; sns.set is only an alias of it
# that the seaborn documentation says may be removed in the future.
sns.set_theme(style='whitegrid')
plt.figure(figsize=(10, 6))
sns.boxplot(x=ny_housing_data['PRICE'])
plt.title('Price Distribution - Boxplot')
plt.xlabel('Price')
plt.show()

In [None]:
# Histogram of prices drawn on a logarithmic x-axis, with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(ny_housing_data['PRICE'], kde=True, log_scale=True, ax=ax)
ax.set_title('Price Distribution - Histogram (Log Scale)')
ax.set_xlabel('Price (Log Scale)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Horizontal bar chart: number of listings for each property type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=ny_housing_data['TYPE'], ax=ax)
ax.set_title('Property Type Distribution')
ax.set_xlabel('Count')
ax.set_ylabel('Property Type')
plt.show()

In [None]:
# Correlation heatmap of price against the numeric property attributes.
plt.figure(figsize=(10, 8))
corr_matrix = ny_housing_data[['PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT']].corr()
# Pin the colour scale to the full correlation range [-1, 1] so the neutral
# midpoint of the diverging 'coolwarm' palette sits at zero correlation;
# left unset, the scale is fitted to the observed values and the red/blue
# split no longer separates positive from negative correlations.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Scatter plot of listing price against property size.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x='PROPERTYSQFT', y='PRICE', data=ny_housing_data, alpha=0.6)
# The PRICE column is plotted unscaled, so render the tick labels divided by
# one million to make the axis agree with its "(in millions)" label.
ax.yaxis.set_major_formatter(
    mtick.FuncFormatter(lambda value, _pos: f'{value / 1e6:g}')
)
plt.title('Price vs Property Size')
plt.xlabel('Property Size (sqft)')
plt.ylabel('Price (in millions)')
plt.show()

Import biblioteki sklearn (do uczenia maszynowego)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

Podział zbioru danych na dane treningowe i dane testowe

In [None]:
# Example feature set: price plus the three numeric property attributes.
X = ny_housing_data.loc[:, ['PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT']]
# Target variable: the property type of each listing.
y = ny_housing_data.loc[:, 'TYPE']
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Utworzenie modelu uczenia maszynowego – drzewo decyzyjne

In [None]:
# Decision-tree classifier trained on the training split.
# The tree permutes candidate features randomly at each split, so an unseeded
# model can differ between runs; fix the seed (same value as the data split)
# to make the fitted tree and the reported metrics reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

Wykonanie "predykcji"

In [None]:
# Predict the class of every held-out row.
y_pred = model.predict(X_test)
# Score the predictions against the true test labels, then report both metrics.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {test_accuracy}')
print(f'Confusion Matrix:\n{test_confusion}')

Wizualizacja "tablicy pomyłek" (confusion matrix)

In [None]:
# Confusion-matrix heatmap with the class names shown on both axes.
# Build the label list as the sorted union of true and predicted classes (the
# set and order confusion_matrix uses when no labels are given) and pass it
# explicitly, so matrix rows/columns and tick labels are guaranteed to align.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()