top of page

 Nashville Housing Data Cleaning Project

Overview and Purpose

In this Project i am working on a Nashville Housing datasets. I have downloaded this dataset from the internet I have done this project using Sql.

The purpose of this project is to clean the raw data and make the data ready in a structured form so that we can use it to find some useful insights out of it

Dataset

This dataset is about houses and buildings made for the purpose of living in Nashville which is one of the State of United States of America

To download the dataset and other related files click on the download button given below

  • Nashville Housing Data

Raw Data.png

Importing Dataset to Sql Server

Importing dataset using Sql Server Import and Export Wizard

Data Import.png

Data Cleaning Using Sql Queries

Download all Project related files by clicking the download button.

--Looking at the dataset using sql Query

select * from [NashvilleHousing]

Converting the data type of Column [Sale Date] to Date data type using Convert function.

-- (1) Standarize Date format

alter table NashvilleHousing 
add [Sale Date Converted] date;

update NashvilleHousing 
set [Sale Date Converted] = CONVERT(date, Saledate)

Here the column [Property address] had some null values. So i have filled the null values with the Property address value assigned to the corresponding Parcel Id as the Parcel Id was repeating.

-- (2)  Populate Property address data

select * from NashvilleHousing
where PropertyAddress is null


select PropertyAddress from [NashvilleHousing] 
where 
  PropertyAddress is not null 
group by 
  PropertyAddress

 


select 
  a.ParcelID, 
  a.PropertyAddress, 
  b.ParcelID, 
  b.PropertyAddress, 
  isnull(a.PropertyAddress, b.PropertyAddress) 
from 
  [NashvilleHousing] a 
  join [NashvilleHousing] b on a.ParcelID = b.ParcelID 
  and a.[UniqueID ] <> b.[UniqueID ] 
where 
  a.PropertyAddress is null

update
set propertyaddress = isnull(a.PropertyAddress, b.PropertyAddress) 
from 
  [NashvilleHousing] a 
  join [NashvilleHousing] b on a.ParcelID = b.ParcelID 
  and a.[UniqueID ] <> b.[UniqueID ] 
where 
  a.PropertyAddress is null


select * from NashvilleHousing

Splitting the column using Substring, Char index function.

--(3) Breaking out address into individual Columns (Address,City ,state)

--Breaking out Property Address using string,Charindex function

select 
  SUBSTRING(PropertyAddress, 1, CHARINDEX(',', propertyaddress)-1), 
  SUBSTRING(PropertyAddress, CHARINDEX(',', propertyaddress)+ 1, LEN(PropertyAddress)) 
from 
  [NashvilleHousing]

alter table NashvilleHousing
add [PropertySplitAddress] nvarchar(255);

update NashvilleHousing
set [PropertySplitAddress] = SUBSTRING(PropertyAddress,1,CHARINDEX(',',propertyaddress)-1)


alter table NashvilleHousing
add [PropertySplitCity] nvarchar(255);

update NashvilleHousing
set [PropertySplitCity] = SUBSTRING(PropertyAddress,CHARINDEX(',',propertyaddress)+1,LEN(PropertyAddress))

Splitting the column using Parsename function.

--Breaking out Owner Address using parsename function 

select 
  PARSENAME(replace(OwnerAddress, ',', '.'),3), 
  PARSENAME(replace(OwnerAddress, ',', '.'), 2), 
  PARSENAME(replace(OwnerAddress, ',', '.'), 1) 
from 
  NashvilleHousing


alter table NashvilleHousing 
add [ownerSplitAddress] nvarchar(255);

update NashvilleHousing
set [OwnerSplitAddress] = PARSENAME(replace(OwnerAddress,',','.'),3)

alter table NashvilleHousing 
add [OwnerSplitCity] nvarchar(255);

update NashvilleHousing
set [OwnerSplitCity] = PARSENAME(replace(OwnerAddress,',','.'),2)

alter table NashvilleHousing
add [ownerSplitState] nvarchar(255);

update NashvilleHousing
set [OwnerSplitState] = PARSENAME(replace(OwnerAddress,',','.'),1)

Replacing values using case statement.

-- (4) Change Y and N to Yes and No in 'Sold as Vacant' field


select 
  distinct(SoldAsVacant), 
  count(SoldAsVacant) 
from 
  NashvilleHousing 
group by 
  SoldAsVacant 
order by 
  2


select 
  SoldAsVacant, 
  case when SoldAsVacant = 'Y' then 'Yes' when SoldAsVacant = 'N' then 'No' else SoldAsVacant end as New 
from 
  NashvilleHousing

update NashvilleHousing 
set SoldAsVacant = case when SoldAsVacant = 'Y' then 'Yes' when SoldAsVacant = 'N' then 'No' else SoldAsVacant end

Removing Duplicate rows.

--(5) Removing Duplicate Rows


select * from NashvilleHousing

with DuplicateRow as (
  select *, ROW_NUMBER() over (partition by ParcelID, 
      PropertyAddress, 
      SaleDate, 
      SalePrice, 
      LegalReference 
      order by 
        UniqueId
    ) as Row_num 
  from 
    NashvilleHousing
)

select * from DuplicateRow 
where Row_num >1
order by UniqueId


Delete from DuplicateRow 
where Row_num >1

Removing Unused Columns.

--(6) Removing Unused Columns

select * from NashvilleHousing

alter table 
  NashvilleHousing 
drop column 
  OwnerAddress, 
  PropertyAddress, 
  TaxDistrict

  SaleDate


Renaming Columns

--(7) Renaming Column Name

select * from NashvilleHousing

sp_rename  'NashvilleHousing.PropertySplitAddress', 'PropertyAddress', 'Column'

sp_rename  'NashvilleHousing.PropertySplitCity', 'PropertyCity', 'Column'


sp_rename  'NashvilleHousing.ownerSplitAddress', 'OwnerAddress', 'Column'

sp_rename  'NashvilleHousing.ownerSplitCity', 'OwnerCity', 'Column'

sp_rename  'NashvilleHousing.OwnerSplitState', 'OwnerState', 'Column'

sp_rename  'NashvilleHousing.Sale Date Converted', 'Sale Date', 'Column'

Data After Cleaning

bottom of page