King County is the most populous county in Washington State. Within it lies Seattle, Washington's most populous city, and around two-thirds of King County's population lives in Seattle's suburbs. Moving eastward, the county gradually gives way to countryside and farmland. King County also hosts the headquarters of ten Fortune 500 companies, including Starbucks, Nordstrom, Alaska Airlines, Costco, Expedia, Microsoft, and Amazon. Microsoft and Amazon are two tech giants of the Big Five, coveted employers for those looking to work in the tech industry.
For those interested in working at a Fortune 500 company based in the Seattle area, it is only natural to wonder about the price of housing in King County. That question gets more complicated once we start qualifying the houses we're interested in: 3 bed, 2 bath on 2 floors versus 4 bed, 4 bath on 1 floor. Being able to estimate the price of a house from a selection of desired features would clearly be convenient; putting a price tag on our "dream house" makes it that much easier for the dream to become a reality.
In this tutorial, we walk through creating a linear regression model in Python to predict the price of properties in King County given some key attributes. Along the way, we'll learn about data parsing, cleaning, and analysis in the context of data science.
Before starting, we'll need to install and import the following Python libraries:
import pandas as pd # For data storage and transformation
import numpy as np # Also for data storage and transformation
from urllib.request import urlopen # For downloading the dataset
from io import BytesIO # For reading and working with the scraped ZIP file
from zipfile import ZipFile # For unzipping the scraped ZIP file
import seaborn as sns # For plotting graphs
import matplotlib.pyplot as plt # Also for plotting graphs
from sklearn import decomposition, preprocessing, model_selection # For performing statistical analysis and modeling
import statsmodels.api as sm # Also for performing statistical analysis and modeling
from statsmodels.tools import eval_measures # Also for performing statistical analysis (model evaluation)
import folium # For plotting on a map
import folium.plugins as plugins # For additional features when plotting on a map
You can learn more about these libraries through their documentation!
Library | Documentation |
---|---|
pandas | https://pandas.pydata.org/ |
numpy | https://numpy.org/doc/ |
urllib | https://docs.python.org/3/library/urllib.html |
io | https://docs.python.org/3/library/io.html |
zipfile | https://docs.python.org/3/library/zipfile.html |
seaborn | https://seaborn.pydata.org/ |
matplotlib | https://matplotlib.org/3.3.3/contents.html |
sklearn | https://scikit-learn.org/stable/ |
statsmodels.api | https://www.statsmodels.org/stable/api.html |
We will be working with a dataset collected by the University of Chicago's Center for Spatial Data Science on "Home Sales in King County, WA from 2014-2015". In your own time, we highly recommend learning more about the Center for Spatial Data Science and digging into their other awesome datasets!
Let's start by grabbing the housing sales dataset and loading it into a Pandas Dataframe!
# Check out https://svaderia.github.io/articles/downloading-and-unzipping-a-zipfile/ for an in depth view on unzipping files in Python
url = "https://geodacenter.github.io/data-and-lab//data/kingcounty.zip"
with urlopen(url) as resp:
    with ZipFile(BytesIO(resp.read())) as z:
        z.extract(member="kingcounty/kc_house_data.csv")
df = pd.read_csv("kingcounty/kc_house_data.csv")
df = df.sort_values(by="date", axis=0)
df
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16768 | 5561000190 | 20140502T000000 | 437500.0 | 3 | 2.25 | 1970 | 35100 | 2.0 | 0 | 0 | ... | 9 | 1970 | 0 | 1977 | 0 | 98027 | 47.4635 | -121.991 | 2340 | 35100 |
9596 | 472000620 | 20140502T000000 | 790000.0 | 3 | 2.50 | 2600 | 4750 | 1.0 | 0 | 0 | ... | 9 | 1700 | 900 | 1951 | 0 | 98117 | 47.6833 | -122.400 | 2380 | 4750 |
9587 | 1024069009 | 20140502T000000 | 675000.0 | 5 | 2.50 | 2820 | 67518 | 2.0 | 0 | 0 | ... | 8 | 2820 | 0 | 1979 | 0 | 98029 | 47.5794 | -122.025 | 2820 | 48351 |
20602 | 7853361370 | 20140502T000000 | 555000.0 | 4 | 2.50 | 3310 | 6500 | 2.0 | 0 | 0 | ... | 8 | 3310 | 0 | 2012 | 0 | 98065 | 47.5150 | -121.870 | 2380 | 5000 |
11577 | 5056500260 | 20140502T000000 | 440000.0 | 4 | 2.25 | 2160 | 8119 | 1.0 | 0 | 0 | ... | 8 | 1080 | 1080 | 1966 | 0 | 98006 | 47.5443 | -122.177 | 1850 | 9000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7898 | 1422700040 | 20150514T000000 | 183000.0 | 3 | 1.00 | 1170 | 7320 | 1.0 | 0 | 0 | ... | 7 | 1170 | 0 | 1962 | 0 | 98188 | 47.4685 | -122.282 | 2040 | 7320 |
928 | 8730000270 | 20150514T000000 | 359000.0 | 2 | 2.75 | 1370 | 1140 | 2.0 | 0 | 0 | ... | 8 | 1080 | 290 | 2009 | 0 | 98133 | 47.7052 | -122.343 | 1370 | 1090 |
5637 | 7923600250 | 20150515T000000 | 450000.0 | 5 | 2.00 | 1870 | 7344 | 1.5 | 0 | 0 | ... | 7 | 1870 | 0 | 1960 | 0 | 98007 | 47.5951 | -122.144 | 1870 | 7650 |
13053 | 5101400871 | 20150524T000000 | 445500.0 | 2 | 1.75 | 1390 | 6670 | 1.0 | 0 | 0 | ... | 6 | 720 | 670 | 1941 | 0 | 98115 | 47.6914 | -122.308 | 920 | 6380 |
16594 | 9106000005 | 20150527T000000 | 1310000.0 | 4 | 2.25 | 3750 | 5000 | 2.0 | 0 | 0 | ... | 8 | 2440 | 1310 | 1924 | 0 | 98115 | 47.6747 | -122.303 | 2170 | 4590 |
21613 rows × 21 columns
There are 21,613 observations and 21 variables in our dataset. Let's take a closer look at what each variable name represents:
Variable | Description |
---|---|
id | Unique identifier of the house |
date | Date of sale |
price | Sell price |
bedrooms | Number of bedrooms |
bathrooms | Number of bathrooms. Noninteger values exist due to "1/2 bathrooms" and "3/4 bathrooms" |
sqft_living | Size of interior living space in square feet |
sqft_lot | Size of land lot in square feet |
floors | Number of floors. Noninteger values exist due to "half floor" architecture |
waterfront | '1' if property has a waterfront, '0' if not |
view | An index from 0 to 4 of how good the property's view is |
condition | Condition of the house, ranked from 1 to 5, 5 being the greatest condition |
grade | Classification by construction material and workmanship quality. Numeric scale with higher numbers being better. For more information, see the King County glossary |
sqft_above | Square feet above ground |
sqft_basement | Square feet below ground |
yr_built | Year built |
yr_renov | Year renovated. '0' if never renovated |
zipcode | 5 digit zip code |
lat | Latitude |
long | Longitude |
sqft_living15 | Average size of interior space for the closest 15 houses, in square feet |
sqft_lot15 | Average size of land lot for the closest 15 houses, in square feet |
Armed with a better understanding of the dataset, let's clean it up in preparation for data analysis! We first want to check for any null or missing values across the Dataframe. Keep in mind that there are 21,613 observations, so every column should also contain 21,613 values. Let's verify this.
df.count()
id 21613 date 21613 price 21613 bedrooms 21613 bathrooms 21613 sqft_living 21613 sqft_lot 21613 floors 21613 waterfront 21613 view 21613 condition 21613 grade 21613 sqft_above 21613 sqft_basement 21613 yr_built 21613 yr_renovated 21613 zipcode 21613 lat 21613 long 21613 sqft_living15 21613 sqft_lot15 21613 dtype: int64
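An equivalent sanity check is to count missing values directly; if the data is clean, every column should report zero:
# Count missing values per column (all zeros means there are no nulls to handle)
print(df.isnull().sum())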
Nice! The Dataframe does not contain any None or numpy.nan values. But this doesn't mean we're completely out of the woods yet. Let's take a deeper look at each column's summary statistics and see if there are any misreported observations.
df.describe()
id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.161300e+04 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 |
mean | 4.580302e+09 | 5.400881e+05 | 3.370842 | 2.114757 | 2079.899736 | 1.510697e+04 | 1.494309 | 0.007542 | 0.234303 | 3.409430 | 7.656873 | 1788.390691 | 291.509045 | 1971.005136 | 84.402258 | 98077.939805 | 47.560053 | -122.213896 | 1986.552492 | 12768.455652 |
std | 2.876566e+09 | 3.671272e+05 | 0.930062 | 0.770163 | 918.440897 | 4.142051e+04 | 0.539989 | 0.086517 | 0.766318 | 0.650743 | 1.175459 | 828.090978 | 442.575043 | 29.373411 | 401.679240 | 53.505026 | 0.138564 | 0.140828 | 685.391304 | 27304.179631 |
min | 1.000102e+06 | 7.500000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 290.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155900 | -122.519000 | 399.000000 | 651.000000 |
25% | 2.123049e+09 | 3.219500e+05 | 3.000000 | 1.750000 | 1427.000000 | 5.040000e+03 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1190.000000 | 0.000000 | 1951.000000 | 0.000000 | 98033.000000 | 47.471000 | -122.328000 | 1490.000000 | 5100.000000 |
50% | 3.904930e+09 | 4.500000e+05 | 3.000000 | 2.250000 | 1910.000000 | 7.618000e+03 | 1.500000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1560.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571800 | -122.230000 | 1840.000000 | 7620.000000 |
75% | 7.308900e+09 | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068800e+04 | 2.000000 | 0.000000 | 0.000000 | 4.000000 | 8.000000 | 2210.000000 | 560.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.678000 | -122.125000 | 2360.000000 | 10083.000000 |
max | 9.900000e+09 | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 4.000000 | 5.000000 | 13.000000 | 9410.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777600 | -121.315000 | 6210.000000 | 871200.000000 |
df[df.bedrooms == 33]
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15870 | 2402100895 | 20140625T000000 | 640000.0 | 33 | 1.75 | 1620 | 6000 | 1.0 | 0 | 0 | ... | 7 | 1040 | 580 | 1947 | 0 | 98103 | 47.6878 | -122.331 | 1330 | 4700 |
1 rows × 21 columns
Hm, some of these values don't make sense in the context of house sales. For example, it should not be possible to sell a house with 0 bathrooms (interestingly, zero-bedroom dwellings are a defined and legal type of residential dwelling). Additionally, something seems fishy about a house with 33 bedrooms but only 1.75 bathrooms on 1,620 square feet. Let's remove these erroneous observations from the Dataframe. This will help us build a more accurate model later on.
df = df.drop(df[df.bedrooms == 33].index)
df = df.drop(df[df.bathrooms == 0].index)
df.describe()
id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.160200e+04 | 2.160200e+04 | 21602.000000 | 21602.000000 | 21602.000000 | 2.160200e+04 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 | 21602.000000 |
mean | 4.580335e+09 | 5.401261e+05 | 3.370892 | 2.115753 | 2080.126146 | 1.510684e+04 | 1.494213 | 0.007546 | 0.234238 | 3.409592 | 7.657532 | 1788.495510 | 291.630636 | 1971.006157 | 84.445237 | 98077.950236 | 47.560065 | -122.213967 | 1986.623414 | 12766.736923 |
std | 2.876737e+09 | 3.671168e+05 | 0.905733 | 0.769010 | 918.161484 | 4.143004e+04 | 0.539761 | 0.086539 | 0.766309 | 0.650481 | 1.173611 | 827.771283 | 442.638912 | 29.374639 | 401.776985 | 53.509608 | 0.138556 | 0.140749 | 685.166307 | 27309.454207 |
min | 1.000102e+06 | 7.800000e+04 | 0.000000 | 0.500000 | 370.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 370.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155900 | -122.519000 | 399.000000 | 651.000000 |
25% | 2.123049e+09 | 3.220000e+05 | 3.000000 | 1.750000 | 1430.000000 | 5.040000e+03 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1190.000000 | 0.000000 | 1951.000000 | 0.000000 | 98033.000000 | 47.471025 | -122.328000 | 1490.000000 | 5100.000000 |
50% | 3.904930e+09 | 4.500000e+05 | 3.000000 | 2.250000 | 1910.000000 | 7.617500e+03 | 1.500000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1560.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571800 | -122.231000 | 1840.000000 | 7620.000000 |
75% | 7.308900e+09 | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068475e+04 | 2.000000 | 0.000000 | 0.000000 | 4.000000 | 8.000000 | 2210.000000 | 560.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.678000 | -122.125000 | 2360.000000 | 10082.250000 |
max | 9.900000e+09 | 7.700000e+06 | 11.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 4.000000 | 5.000000 | 13.000000 | 9410.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777600 | -121.315000 | 6210.000000 | 871200.000000 |
Since the Dataframe stores house sales as observations, each observation is not guaranteed to be a unique house: a house may have been sold several times between 2014 and 2015. This could later introduce bias into our regression model, so let's remove all duplicate houses. Since different observations of the same house represent different sales, we resolve these duplicates by always keeping the most recent sale, preserving the most up-to-date information about each house.
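Before dropping anything, a quick optional check of how many sale records share an id with another record:
# Number of sale records whose house id appears more than once
print(df["id"].duplicated(keep=False).sum())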
df = df.drop_duplicates(subset="id",keep="last")
df.describe()
id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.142500e+04 | 2.142500e+04 | 21425.000000 | 21425.000000 | 21425.000000 | 2.142500e+04 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 | 21425.000000 |
mean | 4.580800e+09 | 5.416890e+05 | 3.371622 | 2.118355 | 2082.934656 | 1.513553e+04 | 1.496103 | 0.007608 | 0.235053 | 3.410548 | 7.662404 | 1791.067445 | 291.867211 | 1971.099510 | 84.773302 | 98077.872765 | 47.560169 | -122.213768 | 1988.386791 | 12784.237340 |
std | 2.876763e+09 | 3.673044e+05 | 0.904648 | 0.768746 | 918.864625 | 4.154826e+04 | 0.540159 | 0.086893 | 0.767084 | 0.649970 | 1.172388 | 828.704669 | 442.846391 | 29.386517 | 402.529730 | 53.473991 | 0.138594 | 0.140816 | 685.472311 | 27380.809425 |
min | 1.000102e+06 | 7.800000e+04 | 0.000000 | 0.500000 | 370.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 370.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155900 | -122.519000 | 399.000000 | 651.000000 |
25% | 2.123050e+09 | 3.249000e+05 | 3.000000 | 1.750000 | 1430.000000 | 5.040000e+03 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1200.000000 | 0.000000 | 1952.000000 | 0.000000 | 98033.000000 | 47.471100 | -122.328000 | 1490.000000 | 5100.000000 |
50% | 3.904921e+09 | 4.500000e+05 | 3.000000 | 2.250000 | 1920.000000 | 7.614000e+03 | 1.500000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1560.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.572000 | -122.230000 | 1840.000000 | 7620.000000 |
75% | 7.308900e+09 | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068900e+04 | 2.000000 | 0.000000 | 0.000000 | 4.000000 | 8.000000 | 2220.000000 | 560.000000 | 1997.000000 | 0.000000 | 98117.000000 | 47.678000 | -122.125000 | 2370.000000 | 10086.000000 |
max | 9.900000e+09 | 7.700000e+06 | 11.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 4.000000 | 5.000000 | 13.000000 | 9410.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777600 | -121.315000 | 6210.000000 | 871200.000000 |
Lastly, to aid in data analysis, let's "tidy up" this Dataframe by following the Tidy Data standard. To do this, we must ensure that each variable is stored in its own column. Currently, the date and yr_renovated columns store more than one variable. Let's split the date column into separate year, month, and day columns and then drop it. For fun, we'll also add a day_of_week column. Let's also add an is_renovated column and replace all values of 0 in yr_renovated with numpy.nan.
df["date"] = pd.to_datetime(df.date)
df["year"] = df.date.dt.year
df["month"] = df.date.dt.month
df["day"] = df.date.dt.day
df["day_of_week"] = df.date.dt.day_name()
df = df.drop("date", axis=1)
df["is_renovated"] = np.where(df.yr_renovated == 0, 0, 1)
df["yr_renovated"] = df.yr_renovated.replace(to_replace=0, value=np.nan)
df
id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | ... | zipcode | lat | long | sqft_living15 | sqft_lot15 | year | month | day | day_of_week | is_renovated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16768 | 5561000190 | 437500.0 | 3 | 2.25 | 1970 | 35100 | 2.0 | 0 | 0 | 4 | ... | 98027 | 47.4635 | -121.991 | 2340 | 35100 | 2014 | 5 | 2 | Friday | 0 |
9596 | 472000620 | 790000.0 | 3 | 2.50 | 2600 | 4750 | 1.0 | 0 | 0 | 4 | ... | 98117 | 47.6833 | -122.400 | 2380 | 4750 | 2014 | 5 | 2 | Friday | 0 |
9587 | 1024069009 | 675000.0 | 5 | 2.50 | 2820 | 67518 | 2.0 | 0 | 0 | 3 | ... | 98029 | 47.5794 | -122.025 | 2820 | 48351 | 2014 | 5 | 2 | Friday | 0 |
20602 | 7853361370 | 555000.0 | 4 | 2.50 | 3310 | 6500 | 2.0 | 0 | 0 | 3 | ... | 98065 | 47.5150 | -121.870 | 2380 | 5000 | 2014 | 5 | 2 | Friday | 0 |
11577 | 5056500260 | 440000.0 | 4 | 2.25 | 2160 | 8119 | 1.0 | 0 | 0 | 3 | ... | 98006 | 47.5443 | -122.177 | 1850 | 9000 | 2014 | 5 | 2 | Friday | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7898 | 1422700040 | 183000.0 | 3 | 1.00 | 1170 | 7320 | 1.0 | 0 | 0 | 3 | ... | 98188 | 47.4685 | -122.282 | 2040 | 7320 | 2015 | 5 | 14 | Thursday | 0 |
928 | 8730000270 | 359000.0 | 2 | 2.75 | 1370 | 1140 | 2.0 | 0 | 0 | 3 | ... | 98133 | 47.7052 | -122.343 | 1370 | 1090 | 2015 | 5 | 14 | Thursday | 0 |
5637 | 7923600250 | 450000.0 | 5 | 2.00 | 1870 | 7344 | 1.5 | 0 | 0 | 3 | ... | 98007 | 47.5951 | -122.144 | 1870 | 7650 | 2015 | 5 | 15 | Friday | 0 |
13053 | 5101400871 | 445500.0 | 2 | 1.75 | 1390 | 6670 | 1.0 | 0 | 0 | 3 | ... | 98115 | 47.6914 | -122.308 | 920 | 6380 | 2015 | 5 | 24 | Sunday | 0 |
16594 | 9106000005 | 1310000.0 | 4 | 2.25 | 3750 | 5000 | 2.0 | 0 | 0 | 5 | ... | 98115 | 47.6747 | -122.303 | 2170 | 4590 | 2015 | 5 | 27 | Wednesday | 0 |
21425 rows × 25 columns
Our cleaned and tidied Dataframe now has 21,425 observations, each representing a unique house sold in King County, WA between 2014 and 2015. With that, we are ready to begin data analysis!
First, let's look at some summary statistics, starting with how prices are distributed across the dataset.
Let's take a look at the overall price distribution. What's the ballpark for housing prices in the county?
plt.figure(figsize=(10,4))
price_dist = sns.distplot(df["price"])
price_dist.set(xlabel="Price in Millions", title="Price Density of Houses in King County")
plt.show()
As we can see, there is a noticeable right skew: most houses sell within a fairly narrow band, while a small number of expensive houses stretch the distribution far to the right.
Let's see if a log transform will create more symmetry and, in turn, more interpretability.
plt.figure(figsize=(10,4))
logged_price_dist = sns.distplot(np.log(df["price"]))
logged_price_dist.set(xlabel="Log Price", title="Log Price Density of Houses in King County")
plt.show()
Looks good! Applying a logarithm transform made the price distribution much more symmetric. This is something we can work with later.
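If we want a number to back this up, pandas' built-in skew() works as a quick check (values near 0 indicate a roughly symmetric distribution):
# Skewness before and after the log transform
print("Raw price skew:", df["price"].skew())
print("Log price skew:", np.log(df["price"]).skew())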
Next, let's take a look at the impact of the time of year on housing, both on prices and on purchase frequency.
What days and months of the year were purchases made the most over 2014-2015?
# Creating groups
month_group = df.groupby(by=["month"])
day_group = df.groupby(by=["day_of_week"])
Month's impact on frequency
plt.figure(figsize=(10,4))
month_freq = month_group.size()
month_freq_plot = sns.barplot(x=month_freq.index, y=month_freq.values)
month_freq_plot.set(xlabel="Month", ylabel="Count", title="Frequency of House Purchases by Month")
plt.show()
Month's impact on price
plt.figure(figsize=(10,4))
month_price_plot = sns.violinplot(x="month", y="price", data=df)
month_price_plot.set(xlabel="Month", ylabel="Price in Dollars", title="Distribution of Price of House Purchases by Month")
plt.show()
plt.figure(figsize=(10,4))
month_price = month_group["price"].mean()
month_avg_price_plot = sns.barplot(month_price.index, month_price.values)
month_avg_price_plot.set(xlabel="Month", ylabel="Average Price in Dollars", title="Average Price of House Purchases by Month")
plt.show()
Housing prices stayed relatively stable across the months, with a few modest peaks in April, June, August, September, and October. While the month doesn't seem to have much effect on housing prices, there is a noticeable relationship between the month and the frequency of purchases: the middle months of March, May, June, and July have the highest purchase counts. There could be a variety of reasons for this, such as warmer weather leading to more people touring houses, or the end of the school year leaving more time to shop for houses.
Day's impact on Frequency
# Creating order list for use in ordering the bar plots
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
plt.figure(figsize=(10,4))
day_freq = day_group.size()
day_freq_plot = sns.barplot(x=day_freq.index, y=day_freq.values, order=day_order)
day_freq_plot.set(xlabel="Day of Week", ylabel="Count", title="Frequency of House Purchases by Day of Week")
plt.show()
Day's impact on Price
plt.figure(figsize=(10,4))
day_price_plot = sns.violinplot(x="day_of_week", y="price", data=df, order=day_order)
day_price_plot.set(xlabel="Day of Week", ylabel="Price in Millions of Dollars", title="Distribution of Price of House Purchases by Day of Week")
plt.show()
plt.figure(figsize=(10,4))
day_price = day_group["price"].mean()
day_avg_price_plot = sns.barplot(x=day_price.index, y=day_price.values, order=day_order)
day_avg_price_plot.set(xlabel="Day of Week", ylabel="Average Price in Dollars", title="Average Price of House Purchases by Day of Week")
plt.show()
Similar to months, the day of the week didn't seem to affect the average price of house purchases very much, with Saturday slightly edging out the other days. However, there is a clear trend in purchase frequency: Tuesday and Wednesday stand out as the most common, and all weekdays were far more common than weekends. While one might expect weekends to be easier for buyers to spend time house hunting, it seems that the sellers' work schedules, if they sell real estate full-time, may be more important.
How about the number of floors, bedrooms, and bathrooms? How do these affect price?
# Floor frequency
plt.figure(figsize=(10,4))
floor_group = df.groupby(by=["floors"])
floor_freq = floor_group.size()
floor_freq_plot = sns.barplot(x=floor_freq.index, y=floor_freq.values)
floor_freq_plot.set(xlabel="Number of Floors", ylabel="Count", title="Frequency of House Purchases by Number of Floors")
plt.show()
It seems that one-floor houses are the most common, followed by two-floor houses. Because there are relatively few houses with other floor counts, any outliers among them will have a greater effect, as we'll see later.
Floors
# Floor price distribution
plt.figure(figsize=(10,4))
floor_price_plot = sns.violinplot(x="floors", y="price", data=df)
floor_price_plot.set(xlabel="Number of Floors", ylabel="Price in Millions of Dollars", title="Distribution of Price of House Purchases by Number of Floors")
plt.show()
As we can see, 2.5 and 3.5 floors have very noticeable outliers in house prices. Since these are pretty uncommon houses with very low frequency, we'll need to be careful in interpreting the information to account for these outliers.
# Average prices
plt.figure(figsize=(10,4))
floor_price = floor_group["price"].mean()
floor_avg_price_plot = sns.barplot(x=floor_price.index, y=floor_price.values)
floor_avg_price_plot.set(xlabel="Number of Floors", ylabel="Average Price in Millions of Dollars", title="Average Price of House Purchases by Number of Floors")
plt.show()
As mentioned earlier, these averages can be misleading: the 2.5- and 3.5-floor averages are inflated by the combination of their low frequency and large outliers. It's hard to account for this with so little data, but setting those two groups aside, 2-floor houses appear to have the highest overall prices. It's hard to tell whether there's a strong correlation with the number of floors, since the trend is irregular and floors only range from 1 to 3.5.
Bedrooms
# Bedroom frequency
plt.figure(figsize=(10,4))
bed_group = df.groupby(by=["bedrooms"])
bed_freq = bed_group.size()
bed_freq_plot = sns.barplot(x=bed_freq.index, y=bed_freq.values)
bed_freq_plot.set(xlabel="Number of Bedrooms", ylabel="Count", title="Frequency of House Purchases by Number of Bedrooms")
plt.show()
It seems that 3-bedroom houses are the most common. As before, with lower frequency counts for 0-, 1-, and 5+-bedroom houses, we need to be wary of the presence and effect of outliers. That 3 and 4 bedrooms are the most common isn't much of a surprise: that's enough to accommodate a family or roommates, while more bedrooms may be excessive and fewer may compete with the convenience of an apartment.
# Bedroom price distribution
plt.figure(figsize=(10,4))
bed_price_plot = sns.violinplot(x="bedrooms", y="price", data=df)
bed_price_plot.set(xlabel="Number of Bedrooms", ylabel="Price in Millions of Dollars", title="Distribution of Price of House Purchases by Number of Bedrooms")
plt.show()
For houses with 7 and 8 bedrooms, there is a noticeable skew toward higher prices. Combined with their low frequencies, we can expect these two groups to have some of the highest averages.
# Average prices
plt.figure(figsize=(10,4))
bed_price = bed_group["price"].mean()
bed_avg_price_plot = sns.barplot(x=bed_price.index, y=bed_price.values)
bed_avg_price_plot.set(xlabel="Number of Bedrooms", ylabel="Average Price in Millions of Dollars", title="Average Price of House Purchases by Number of Bedrooms")
plt.show()
Due to the aforementioned skews, 7- and 8-bedroom houses sold for the most on average. While there is a generally increasing trend between the number of bedrooms and price, it drops off after 8 bedrooms. This could be due to the small amount of data for houses with more bedrooms, or such houses may be undesirable in some other way. Either way, there seems to be a noticeable positive correlation between the number of bedrooms and price up to 8 bedrooms.
Bathrooms
# Bathroom frequency
plt.figure(figsize=(14,4))
bath_group = df.groupby(by=["bathrooms"])
bath_freq = bath_group.size()
bath_freq_plot = sns.barplot(x=bath_freq.index, y=bath_freq.values)
bath_freq_plot.set(xlabel="Number of Bathrooms", ylabel="Count", title="Frequency of House Purchases by Number of Bathrooms")
plt.show()
It seems that most of the houses are concentrated within the 1-2.5 bathroom range, with relatively very few houses outside of that range.
# Bathroom price distribution
plt.figure(figsize=(14,4))
bath_price_plot = sns.violinplot(x="bathrooms", y="price", data=df)
bath_price_plot.set(xlabel="Number of Bathrooms", ylabel="Price in Millions of Dollars", title="Distribution of Price of House Purchases by Number of Bathrooms")
plt.show()
Once we reach 4.75 bathrooms, the spread of prices becomes far greater. Combined with the extremely small number of data points for houses with that many bathrooms, there is relatively little we can reliably conclude about them.
# Average prices
plt.figure(figsize=(14,4))
bath_price = bath_group["price"].mean()
bath_avg_price_plot = sns.barplot(x=bath_price.index, y=bath_price.values)
bath_avg_price_plot.set(xlabel="Number of Bathrooms", ylabel="Average Price in Millions of Dollars", title="Average Price of House Purchases by Number of Bathrooms")
plt.show()
Overall, there seems to be a consistent increasing trend in price with the number of bathrooms. As mentioned before, there are few data points above 4.75 bathrooms, but even excluding those, the trend seems to hold.
Analysis of floor/bedroom/bathroom
How many renovations occurred? What was the distribution of years between construction and renovation? Did it differ by zipcode?
renovate_group = df.groupby(by=["is_renovated"])
renovate_freq = renovate_group.size()
renovate_freq_plot = sns.barplot(x=renovate_freq.index, y=renovate_freq.values)
renovate_freq_plot.set(xlabel="Renovated or not (1 for renovated)", ylabel="Count", title="Renovated")
plt.show()
Based on this chart, we can see that renovations were fairly rare in this county.
# Distribution of years passed
plt.figure(figsize=(10,4))
renovated = df[df["is_renovated"] == 1]
ren_year_dist = sns.distplot(renovated["yr_renovated"] - renovated["yr_built"])
ren_year_dist.set(xlabel="Years Between Construction and Renovation", title="Years-to-Renovation Density of Houses in King County")
plt.show()
It seems that there's a slightly bimodal distribution of years until renovation, with the highest frequency in the range of 40-60 years until renovation.
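We don't pursue the zipcode part of the question further here, but a quick sketch of how one could check it might look like this (years_to_renovation and top_zips are helper names introduced just for the example):
# Sketch: median years between construction and renovation for the five
# zipcodes with the most renovated houses
years_to_renovation = renovated["yr_renovated"] - renovated["yr_built"]
top_zips = renovated["zipcode"].value_counts().head().index
print(years_to_renovation.groupby(renovated["zipcode"]).median().loc[top_zips])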
What about the condition and grade of the house? How are those qualities distributed?
# Distribution of condition
plt.figure(figsize=(10,4))
cond_dist = sns.distplot(df["condition"], kde=False)
cond_dist.set(xlabel="Condition", title="Condition Distribution of Houses in King County")
plt.show()
It seems that the majority of houses are in condition 3, i.e. middle of the road. It's interesting that condition 1 and 2 houses are very infrequently sold. This could be due to some degree of self-selection: houses in worse condition may be less likely to be put up for sale.
# Distribution of grade
plt.figure(figsize=(10,4))
cond_dist = sns.distplot(df["grade"], kde=False)
cond_dist.set(xlabel="Grade", title="Grade Distribution of Houses in King County")
plt.show()
It seems that the grades are distributed fairly unimodally, with the majority of houses at grade 7. There are very few houses below that grade, which, as mentioned with condition, could be due to a degree of self-selection.
Now let's see some of this information on a map. The following code produces a map from a sample of 5,000 houses, color coded by price bracket, with the option to filter by the condition of the house.
# Determine price bucket, return integer indicating which bucket
def get_bucket(price):
    if price < 100000: # Lower
        return 0
    elif price < 300000: # Lower-med
        return 1
    elif price < 600000: # Med
        return 2
    elif price < 1000000: # Upper-med
        return 3
    else: # Upper
        return 4

# Given a bucket number, produce the color for use in map
def get_color(bucket):
    if bucket == 0:
        return "red"
    elif bucket == 1:
        return "orange"
    elif bucket == 2:
        return "green"
    elif bucket == 3:
        return "blue"
    else:
        return "purple"
# Filter map
m = folium.Map(location=[47.5480, -121.9750])
s = df.sample(n=5000)
group1 = plugins.FeatureGroupSubGroup(m, 'Condition 1')
group2 = plugins.FeatureGroupSubGroup(m, 'Condition 2')
group3 = plugins.FeatureGroupSubGroup(m, 'Condition 3')
group4 = plugins.FeatureGroupSubGroup(m, 'Condition 4')
group5 = plugins.FeatureGroupSubGroup(m, 'Condition 5')
m.add_child(group1)
m.add_child(group2)
m.add_child(group3)
m.add_child(group4)
m.add_child(group5)
for j, row in s.iterrows():
    price = row["price"]
    price_bucket = get_bucket(price)
    marker = folium.CircleMarker(row[["lat", "long"]],
                                 radius=5,
                                 color=get_color(price_bucket),
                                 popup=price)
    cond = row["condition"]
    if cond == 1:
        group1.add_child(marker)
    elif cond == 2:
        group2.add_child(marker)
    elif cond == 3:
        group3.add_child(marker)
    elif cond == 4:
        group4.add_child(marker)
    else:
        group5.add_child(marker)
folium.LayerControl().add_to(m)
m
We can see that many of the higher-priced properties are clustered around Seattle and Bellevue. This is expected, since those are popular, fairly urban cities. As we move south, further away from those two cities, property prices start dropping. Waterfront properties along the coast are also more expensive, while inland properties tend to be a bit cheaper. Interestingly, even though the vast majority of properties are at condition 3, many of the most expensive properties are also condition 3. Additionally, cheaper properties have a strong presence at condition 4 and a decent presence at condition 5, suggesting that condition and price aren't necessarily strongly correlated, which is something we'll examine a little later. Waterfront status and condition don't seem to have much of a relationship either, except at condition 5, where most properties are on the waterfront.
Now, let's take another look at condition and its effect on price.
# Overall price distribution
plt.figure(figsize=(14,4))
cond_price_plot = sns.violinplot(x="condition", y="price", data=df)
cond_price_plot.set(xlabel="Condition of House", ylabel="Price in Millions of Dollars", title="Distribution of Price of House Purchases by House Condition")
plt.show()
# Average prices
plt.figure(figsize=(14,4))
cond_group = df.groupby(by=["condition"])
cond_price = cond_group["price"].mean()
cond_avg_price_plot = sns.barplot(x=cond_price.index, y=cond_price.values)
cond_avg_price_plot.set(xlabel="Condition of House", ylabel="Average Price in Dollars", title="Average Price of House Purchases by House Condition")
plt.show()
As we can see, houses in condition 3 or higher sell for far more than those in conditions 1 and 2. This is interesting because condition 1 and 2 houses are fairly rare, which might reflect some kind of selection bias: houses in poorer condition may be less likely to be sold in the first place. It's also interesting that, among houses in conditions 3-5, prices don't strictly increase: condition 4 houses are on average less expensive than condition 3 houses, and are also a little less common. Condition 5 houses are clearly the most expensive, but, as we gleaned from the map, this may also be due to many of them being on the waterfront, which we examine next.
Waterfront
# Overall price distribution
plt.figure(figsize=(14,4))
water_price_plot = sns.violinplot(x="waterfront", y="price", data=df)
water_price_plot.set(xlabel="Presence of Waterfront (0 for no waterfront, 1 for waterfront)", ylabel="Price in Millions of Dollars", title="Distribution of Price of House Purchases by Presence of Waterfront")
plt.show()
There are a few outliers in both groups, but because there are far fewer houses with a waterfront, outliers affect that group more, which is something we need to be careful of. That said, prices for waterfront houses are more spread out overall than for non-waterfront houses, so it's reasonable to expect their average to be higher regardless of outliers.
# Average prices
plt.figure(figsize=(14,4))
water_group = df.groupby(by=["waterfront"])
water_price = water_group["price"].mean()
water_avg_price_plot = sns.barplot(x=water_price.index, y=water_price.values)
water_avg_price_plot.set(xlabel="Presence of Waterfront (0 for no waterfront, 1 for waterfront)", ylabel="Average Price in Dollars", title="Average Price of House Purchases by Presence of Waterfront")
plt.show()
Keeping the earlier outliers in mind, there still seems to be a large price difference between houses with a waterfront and houses without, in line with what we saw on the map. There are various reasons why a waterfront could command a premium, but those explanations are out of scope for this exploratory analysis.
Based on our exploratory analysis, we have a good idea of which features might affect the price of a house in King County. We will now focus on developing a linear regression model to predict the price of a house, fitting the regression using ordinary least squares, the most common approach.
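Concretely, least squares picks the coefficient vector $\beta$ that minimizes the sum of squared residuals between the observed prices $y_i$ and the model's predictions, where $\mathbf{x}_i$ is the feature vector of house $i$:

$$\hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n} \left( y_i - \mathbf{x}_i^{\top} \beta \right)^2$$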
Before continuing, let's prepare the Dataframe for regression. For our simple model, we will exclude the id, day_of_week, day, month, year, lat, long, and zipcode columns, as these variables do not provide intrinsic value in predicting prices. We also exclude yr_renovated and is_renovated, since only 910 houses in the dataset were ever renovated. Similarly, we exclude waterfront, as there are only 163 waterfront houses in the Dataframe.
X = df.drop(["id","day_of_week", "yr_renovated", "is_renovated", "waterfront","day", "month", "lat", "long", "zipcode", "year"], axis=1)
X
price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | view | condition | grade | sqft_above | sqft_basement | yr_built | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16768 | 437500.0 | 3 | 2.25 | 1970 | 35100 | 2.0 | 0 | 4 | 9 | 1970 | 0 | 1977 | 2340 | 35100 |
9596 | 790000.0 | 3 | 2.50 | 2600 | 4750 | 1.0 | 0 | 4 | 9 | 1700 | 900 | 1951 | 2380 | 4750 |
9587 | 675000.0 | 5 | 2.50 | 2820 | 67518 | 2.0 | 0 | 3 | 8 | 2820 | 0 | 1979 | 2820 | 48351 |
20602 | 555000.0 | 4 | 2.50 | 3310 | 6500 | 2.0 | 0 | 3 | 8 | 3310 | 0 | 2012 | 2380 | 5000 |
11577 | 440000.0 | 4 | 2.25 | 2160 | 8119 | 1.0 | 0 | 3 | 8 | 1080 | 1080 | 1966 | 1850 | 9000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7898 | 183000.0 | 3 | 1.00 | 1170 | 7320 | 1.0 | 0 | 3 | 7 | 1170 | 0 | 1962 | 2040 | 7320 |
928 | 359000.0 | 2 | 2.75 | 1370 | 1140 | 2.0 | 0 | 3 | 8 | 1080 | 290 | 2009 | 1370 | 1090 |
5637 | 450000.0 | 5 | 2.00 | 1870 | 7344 | 1.5 | 0 | 3 | 7 | 1870 | 0 | 1960 | 1870 | 7650 |
13053 | 445500.0 | 2 | 1.75 | 1390 | 6670 | 1.0 | 0 | 3 | 6 | 720 | 670 | 1941 | 920 | 6380 |
16594 | 1310000.0 | 4 | 2.25 | 3750 | 5000 | 2.0 | 0 | 5 | 8 | 2440 | 1310 | 1924 | 2170 | 4590 |
21425 rows × 14 columns
Linear regression has 5 key assumptions:
1. A linear relationship between the independent variables and the dependent variable
2. Little or no multicollinearity among the independent variables
3. Independence of the errors (no autocorrelation)
4. Homoscedasticity (constant variance) of the errors
5. Normally distributed errors
We've shown through EDA that we should be reasonably safe assuming our features have a linear relationship with price. What we need to do now is ensure that our features are independent of one another. To verify this, let's create a correlation matrix to view the relationships between our features.
# Check out https://towardsdatascience.com/annotated-heatmaps-in-5-simple-steps-cc2a0660a27d to learn more about this code!
f, ax = plt.subplots(figsize=(16, 16))
corrMatrix = X.drop("price",axis=1).corr()
mask = np.zeros_like(corrMatrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corrMatrix,
mask=mask,
square = True,
linewidths = .5,
cmap = "coolwarm",
cbar_kws = {'shrink': .4,
"ticks" : [-1, -.5, 0, 0.5, 1]},
vmin = -1,
vmax = 1,
annot = True,
annot_kws = {"size": 12},
fmt = ".2f")
ax.set_yticklabels(corrMatrix.columns, rotation = 0)
ax.set_xticklabels(corrMatrix.columns)
plt.show()
We have chosen to view the correlation matrix as a heatmap. Red cells indicate variables with positive correlation; blue cells indicate variables with negative correlation. The intensity of the color matches the strength of the correlation. It looks like some of our variables are collinear. This breaks assumptions required for linear regression.
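If we'd rather have a numerical summary of this collinearity than eyeball a heatmap, the variance inflation factor (VIF) from statsmodels is one option. This is just a quick sketch; values well above roughly 10 are commonly read as a red flag:
# Sketch: compute the VIF of each feature (higher = more collinear with the rest)
from statsmodels.stats.outliers_influence import variance_inflation_factor
features = X.drop("price", axis=1)
vifs = pd.Series(
    [variance_inflation_factor(features.values, i) for i in range(features.shape[1])],
    index=features.columns,
)
print(vifs.sort_values(ascending=False))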
In order to create independent inputs for our model, we will find orthogonal components among our input variables using Principal Component Analysis (PCA). PCA provides the added benefit of dimensionality reduction, i.e. reducing the number of input variables for our model. This helps capture the "essence" of our dataset and lowers the chance of overfitting.
The downside to PCA comes in the form of a reduction in result interpretability. Coefficients of a principal component in a regression model do not provide as much insight into the relations between independent and dependent variables. Thus, we sacrifice interpretability for accuracy.
First, we standardize our input data so that every feature contributes on a comparable scale, which PCA needs in order to work well. We then use PCA to keep components until they explain at least 85% of the variance in the data. Finally, we transform our collinear input variables into orthogonal principal components and store them in a new Dataframe.
X_ = X.drop(["price"], axis=1)
x = preprocessing.StandardScaler().fit_transform(X_.values)
pca = decomposition.PCA(n_components=.85)
components = pca.fit_transform(x)
pca_df = pd.DataFrame(data = components, columns=["PC"+str(i) for i in range(1, components.shape[1]+1) ])
pca_df
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | |
---|---|---|---|---|---|---|---|
0 | 0.834523 | -0.330428 | 0.993173 | -0.249437 | -1.292710 | 0.457758 | -0.634719 |
1 | 0.488472 | 1.538731 | -0.942503 | -0.225730 | -0.027299 | -0.213155 | -1.466825 |
2 | 2.542715 | -0.161191 | 1.533352 | -1.003149 | -0.048555 | -0.694715 | 1.232852 |
3 | 2.449598 | -1.480011 | -0.143836 | -0.470271 | -0.155325 | -0.360746 | 0.207684 |
4 | -0.121114 | 1.101796 | -0.801796 | -0.274158 | 1.879202 | -0.073200 | -0.210351 |
... | ... | ... | ... | ... | ... | ... | ... |
21420 | -1.954424 | -0.391224 | 0.273912 | 0.327240 | 0.246814 | -1.143089 | -0.135827 |
21421 | -0.486297 | -1.831861 | -0.036272 | 0.715680 | 0.713416 | 1.578398 | -0.910509 |
21422 | -0.093520 | -0.362421 | -0.297258 | -0.902625 | 0.333950 | -0.724831 | 1.646430 |
21423 | -2.938580 | 0.393784 | -0.090758 | 0.528558 | 1.246843 | 0.197309 | -0.187743 |
21424 | 1.290570 | 2.551272 | -1.620429 | -1.400157 | -1.192710 | 0.680239 | -0.114833 |
21425 rows × 7 columns
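We can also confirm that the seven retained components together clear the 85% variance threshold we asked for:
# Fraction of total variance explained by each principal component
print(pca.explained_variance_ratio_)
print("Total variance explained:", pca.explained_variance_ratio_.sum())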
PCA transforms our original dataframe using a weight matrix. This weight matrix indicates the importance of each input variable in creating each principal component. Below is a heatmap plotting the PCA weight matrix. Note, for example, that sqft_lot and sqft_lot15 are highly correlated and can be largely explained by a single principal component.
f, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(pca.components_.T,
cmap = "coolwarm",
yticklabels=X.drop("price", axis=1).columns,
xticklabels=[ "PCA"+str(x) for x in range(1,components.shape[1]+1)],
annot = True,
fmt = ".2f")
plt.show()
We are now ready to fit an ordinary-least-squares linear regression model to our Dataframe. Let's split our data into a training and testing set and train our model!
We pose a null hypothesis that there is no relationship between house pricing and our principal components.
X_pca = pca_df
y = df["price"].rename("Log of Actual Price")
X_pca_train, X_pca_test, y_pca_train, y_pca_test = model_selection.train_test_split(X_pca, y)
y_pca_train = y_pca_train.transform(np.log)
y_pca_test = y_pca_test.transform(np.log)
X_pca_train = sm.add_constant(X_pca_train)
X_pca_test = sm.add_constant(X_pca_test)
# Linear Regression on Principal Components
pca_est = sm.OLS(list(y_pca_train), X_pca_train).fit()
print("Using PCA:")
print(pca_est.summary())
y_pca_pred = pca_est.predict(X_pca_test)
y_pca_pred = y_pca_pred.rename("Log of Predicted Price")
Using PCA:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.578
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     3143.
Date:                Mon, 21 Dec 2020   Prob (F-statistic):               0.00
Time:                        15:33:52   Log-Likelihood:                -5402.1
No. Observations:               16068   AIC:                         1.082e+04
Df Residuals:                   16060   BIC:                         1.088e+04
Df Model:                           7
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.0541      0.003   4884.215      0.000      13.049      13.059
PC1            0.1592      0.001    132.455      0.000       0.157       0.162
PC2            0.0891      0.002     45.654      0.000       0.085       0.093
PC3           -0.0519      0.002    -24.876      0.000      -0.056      -0.048
PC4            0.0629      0.003     22.643      0.000       0.057       0.068
PC5           -0.0979      0.003    -33.245      0.000      -0.104      -0.092
PC6           -0.0502      0.003    -14.955      0.000      -0.057      -0.044
PC7           -0.0340      0.003     -9.713      0.000      -0.041      -0.027
==============================================================================
Omnibus:                       42.428   Durbin-Watson:                   2.026
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               34.655
Skew:                          -0.037   Prob(JB):                     2.98e-08
Kurtosis:                       2.785   Cond. No.                         2.91
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Print the P-value for each coefficient
print(pca_est.summary2().tables[1]['P>|t|'])
const 0.000000e+00 PC1 0.000000e+00 PC2 0.000000e+00 PC3 4.608429e-134 PC4 9.091084e-112 PC5 1.967913e-234 PC6 3.141864e-50 PC7 3.050806e-22 Name: P>|t|, dtype: float64
# Print the root mean square error
print("Root Mean Square Error: ", eval_measures.rmse(y_pca_test, y_pca_pred))
Root Mean Square Error: 0.34035121878667896
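Keep in mind that both the predictions and the test targets are on the natural-log scale; exponentiating converts them back to dollars. For example:
# Convert the first few log-scale predictions back to dollar prices
print(np.exp(y_pca_pred.head()).round(0))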
# Plot distribution of predictions
f, ax = plt.subplots(figsize=(20, 10))
sns.distplot(y_pca_test, hist=True, label="Actual Price")
sns.distplot(y_pca_pred, hist=True, label="Predicted Price")
plt.xlabel("Price (Logarithmic Scale)")
plt.legend()
plt.show()
# Plot prediction versus real value scatterplot
f, ax = plt.subplots(figsize=(20, 10))
sns.regplot(x=y_pca_test, y =y_pca_pred)
plt.show()
Thus we have created a linear regression model that predicts the price of a house in King County, WA between 2014 and 2015 given desired values for bedrooms, bathrooms, sqft_living, sqft_lot, floors, view, condition, grade, sqft_above, sqft_basement, yr_built, sqft_living15, and sqft_lot15.
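As an illustration of how the fitted pieces go together, here is a hedged sketch that pushes a hypothetical "dream house" through the same standardization, PCA, and regression steps. The feature values below are made up for the example, and since the StandardScaler fitted earlier wasn't stored, we refit one on the same data, which reproduces the same transform:
# Hypothetical dream house: the feature values here are illustrative only
dream_house = pd.DataFrame([{
    "bedrooms": 3, "bathrooms": 2.0, "sqft_living": 2000, "sqft_lot": 6000,
    "floors": 2.0, "view": 0, "condition": 3, "grade": 8, "sqft_above": 1500,
    "sqft_basement": 500, "yr_built": 1990, "sqft_living15": 2000, "sqft_lot15": 6000,
}], columns=X_.columns)
# Refit a scaler on the training features (same transform used before PCA above)
scaler = preprocessing.StandardScaler().fit(X_.values)
dream_pcs = pca.transform(scaler.transform(dream_house.values))
dream_X = sm.add_constant(pd.DataFrame(dream_pcs, columns=pca_df.columns), has_constant="add")
# The model predicts log price, so exponentiate to get dollars
log_price = np.asarray(pca_est.predict(dream_X))[0]
print(f"Estimated price: ${np.exp(log_price):,.0f}")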
We are able to reject the null hypothesis that there is no relationship between house prices and our principal components: every principal component has a coefficient with a P-value far below $\alpha = 0.01$, meaning our predictors have a statistically significant effect on the dependent variable. For example, PC3 is weighted mostly by sqft_lot and sqft_lot15 and has a negative relationship with house prices: as the square footage of a property's lot increases, the predicted price goes down. This may be explained by the fact that houses with larger lots tend to sit in rural areas and are priced much lower than houses in urban areas.
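To see which original features dominate any given component, we can inspect its loadings directly; for example, PC3 corresponds to the third row of pca.components_:
# Loadings of PC3, sorted by absolute weight
pc3 = pd.Series(pca.components_[2], index=X_.columns)
print(pc3.reindex(pc3.abs().sort_values(ascending=False).index))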
However, our model is not perfect. Its coefficient of determination, $R^2$, is just below 0.60. Since we are working with real-world data that is inherently noisy, this is not a bad value, but it still means that more than 40% of the variance in housing prices cannot be explained by the selected principal components or, in turn, the original input features. Additionally, the model's root mean square error on the natural-log scale is about 0.34, which means predicted prices are typically off by a factor of about $e^{0.34} \approx 1.4$, i.e. roughly 40% of the observed price. This limits the model's usability, since a 40% swing in a house's price significantly changes its affordability. Since our features already capture the basic characteristics of a house, we are unlikely to produce much more accurate price predictions without additional information.
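As a quick check on that factor, we can exponentiate the RMSE directly:
# RMSE is on the natural-log scale, so exp(RMSE) gives the typical multiplicative error
rmse = eval_measures.rmse(y_pca_test, y_pca_pred)
print(f"Typical multiplicative error: x{np.exp(rmse):.2f}")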
Shopping for a house is an important step in many people's lives. Everyone wants to buy low and, perhaps someday, sell high. Figuring out which traits affect house prices, and how, can go a long way toward shopping smarter and maximizing gain while minimizing loss. So, what have we learned?
Overall, we learned a lot about housing in King County and the factors that go into price. Even with all this information, there's still plenty of room to learn more. For example, what about a house's proximity to the nearest grocery store? To good schools? To public transportation? There's plenty more data exploration that can be done to further pin down what makes a house worth what it's worth.