# 学习笔记 · 2023年3月30日

# 昆明房屋数据聚类分析

import csv
import random

import matplotlib.pyplot as plt
import matplotlib as mpl

# Use a font with CJK glyphs so the Chinese titles and labels render in the charts.
mpl.rcParams['font.sans-serif'] = ['SimHei']

# Cleaned records, one per usable CSV row: [row[1], first token of row[2], price],
# e.g. ['商业类', '呈贡', 9800] (presumably property type, district, unit price).
m_li = []

# Stream the CSV and keep only rows whose 4th column is an integer price <= 20000.
# newline='' is how the csv module expects files to be opened, so that line
# breaks inside quoted fields are handled by the reader itself.
with open('昆明.csv', encoding="UTF-8", newline='') as f:
    for row in csv.reader(f):
        try:
            price = int(row[3])
        except (ValueError, IndexError):
            # Non-numeric price (e.g. a header line) or a row with too few columns.
            continue
        if price <= 20000:
            # Only the part of column 2 before the first space is kept.
            m_li.append([row[1], row[2].split(' ')[0], price])
# Starting guesses for the four cluster centres.
center1, center2, center3, center4 = 7000, 10000, 13000, 16000

# Loop flag for the refinement stage; starts True so the first pass always runs.
change = True

# Members of clusters 1-4 (four distinct, initially empty lists).
cluster1, cluster2, cluster3, cluster4 = [], [], [], []

# History of each centre, seeded with its starting guess.
center1_list, center2_list = [center1], [center2]
center3_list, center4_list = [center3], [center4]

# ---------------------------- Cluster analysis ----------------------------
# Initial assignment: put every record into the cluster whose starting centre
# is closest to its price (index 2); on equal distance the lower-numbered
# cluster wins because list.index returns the first minimum.
for record in m_li:
    gaps = [abs(record[2] - centre) for centre in (center1, center2, center3, center4)]
    target = (cluster1, cluster2, cluster3, cluster4)[gaps.index(min(gaps))]
    target.append(record)
# K-means style refinement on the price (index 2 of each record).
# Every pass recomputes the four centres, logs them, then re-assigns records;
# the loop stops after the first pass in which no record changes cluster.
while change:
    clusters = [cluster1, cluster2, cluster3, cluster4]
    centers = [center1, center2, center3, center4]

    # Update step: each centre becomes the truncated mean price of its members.
    # An empty cluster keeps its previous centre instead of dividing by zero.
    for idx, members in enumerate(clusters):
        if members:
            centers[idx] = int(sum(rec[2] for rec in members) / len(members))
    center1, center2, center3, center4 = centers

    # Log this pass's centres; the history lists are plotted later.
    center1_list.append(center1)
    center2_list.append(center2)
    center3_list.append(center3)
    center4_list.append(center4)

    # Assignment step: a record moves only when some other centre is strictly
    # closer than its current one, and then it goes to the nearest centre
    # (not merely the first closer one found).
    change = False
    stayed = [[] for _ in clusters]
    moved_in = [[] for _ in clusters]
    for idx, members in enumerate(clusters):
        for rec in members:
            gaps = [abs(rec[2] - centre) for centre in centers]
            nearest = gaps.index(min(gaps))
            if gaps[nearest] < gaps[idx]:
                moved_in[nearest].append(rec)
                change = True
            else:
                stayed[idx].append(rec)

    # Rebuild in place so cluster1..cluster4 stay bound to the same list objects.
    for members, keep, extra in zip(clusters, stayed, moved_in):
        members[:] = keep + extra

# ---------------------- Cluster charts ----------------------
# Line chart of how each cluster centre changed from pass to pass.
steps = len(center1_list)  # number of logged centre values per cluster
x_axis = list(range(steps))
last = steps - 1  # x position of the final pass
series = (
    (center1_list, 'r'),
    (center2_list, 'g'),
    (center3_list, 'b'),
    (center4_list, 'k'),
)

for history, colour in series:
    plt.plot(x_axis, history, c=colour)
plt.ylim(bottom=5)
plt.title("中心值变化情况")
plt.xlabel('聚类次数')
plt.ylabel('中心价格')

# Dotted vertical marker at the final pass, drawn up to cluster 4's last centre.
plt.plot([last, last], [0, center4_list[-1]], 'k:', linewidth=3)

# Highlight each final centre, then label it with its value 1000 units above the point.
for history, colour in series:
    plt.scatter(last, history[-1], s=20, color=colour)
for history, _ in series:
    plt.annotate(history[-1], xy=(last, history[-1]), xytext=(last, history[-1] + 1000))
plt.show()

# Bar chart of the number of records in each cluster.
name_list = [1, 2, 3, 4]
num_list = [len(group) for group in (cluster1, cluster2, cluster3, cluster4)]
plt.ylim(top=300)
plt.ylabel("数量")  # y-axis label
plt.xlabel("聚类")
# Write each count 10 units above where the top of its bar will be.
for position, total in zip(name_list, num_list):
    plt.text(position, total + 10, total, ha='center')
plt.bar(name_list, num_list, width=0.6, facecolor='#9999ff')
plt.xticks((1, 2, 3, 4), ('聚类1', '聚类2', '聚类3', '聚类4'))
plt.show()

def sang(cluster):
    """Build scatter-plot coordinates for the records of one cluster.

    Each record is indexed as ``record[0]`` (type label) and ``record[2]``
    (price). The x value is the price. The y value is a random integer
    drawn from a band chosen by the type label: 5-31 for '住宅', 35-61 for
    '别墅' and 65-91 for anything else (bounds inclusive).

    Returns a ``(x_values, y_values)`` tuple of two equally long lists.
    """
    prices = [record[2] for record in cluster]
    heights = []
    for record in cluster:
        kind = record[0]
        if kind == '住宅':
            band = (5, 31)
        elif kind == '别墅':
            band = (35, 61)
        else:
            band = (65, 91)
        heights.append(random.randint(*band))
    return prices, heights

# Scatter plot: price on x, the random band from sang() on y, one colour per cluster.
for members, colour in ((cluster1, 'r'), (cluster2, 'g'), (cluster3, 'b'), (cluster4, 'y')):
    reside_x, reside_y = sang(members)
    plt.scatter(reside_x, reside_y, c=colour)
plt.title("昆明市房价分析")  # chart title
plt.yticks(())  # hide the y-axis ticks
plt.show()
# ----------------------------------- Per-cluster breakdown -----------------------------------
# Fold the commercial type labels (商业类 / 底商 / 写字楼) into the single label
# '商业类': a label matches when its first character is '商' or '写', or its
# second character is '商'.
# Every record of every cluster is visited, including the first one, and
# slicing is used so that labels shorter than two characters cannot raise
# IndexError.
for cluster in (cluster1, cluster2, cluster3, cluster4):
    for record in cluster:
        kind = record[0]
        if kind[:1] in ('商', '写') or kind[1:2] == '商':
            record[0] = '商业类'

# For every cluster, collect record[1] of each member into three buckets by
# type label: slot 0 for '住宅', slot 1 for '别墅', slot 2 for everything else
# (treated as the commercial group).
place1, place2, place3, place4 = ([[], [], []] for _ in range(4))
for members, buckets in (
    (cluster1, place1),
    (cluster2, place2),
    (cluster3, place3),
    (cluster4, place4),
):
    for record in members:
        kind = record[0]
        slot = 0 if kind == "住宅" else 1 if kind == "别墅" else 2
        buckets[slot].append(record[1])

# 2x2 grid of bar charts: for each cluster, how many entries fall into each
# of the three type buckets of its place list.
plt.figure(figsize=(12, 12))
name_list = [1, 2, 3]
for position, groups, caption in (
    (221, place1, "聚类一"),
    (222, place2, "聚类二"),
    (223, place3, "聚类三"),
    (224, place4, "聚类四"),
):
    num_list = [len(groups[k]) for k in range(3)]
    plt.subplot(position)
    plt.ylim(top=150)
    plt.ylabel("数量")  # y-axis label
    plt.xlabel(caption)
    # Write each count 10 units above where the top of its bar will be.
    for x, y in zip(name_list, num_list):
        plt.text(x, y + 10, y, ha='center')
    plt.bar(name_list, num_list, width=0.6, facecolor='#FFD3C7')
    plt.xticks((1, 2, 3), ('基本住宅', '别墅', '商业类'))
plt.show()
# -----------------------------------------------------------
# Text report: for each cluster, print the distinct entries of each type
# bucket in first-seen order (dict.fromkeys drops duplicates and keeps order).
# The loop variable is called `heading` so the builtin `str` is not rebound.
for heading, place in zip(["\n\n聚类一", "\n\n聚类二", "\n\n聚类三", "\n\n聚类四"], [place1, place2, place3, place4]):
    print(heading)
    for label, names in zip(("基本住宅:", "\n别墅:", "\n商业类:"), place):
        print(label)
        for name in dict.fromkeys(names):
            print(name, end=" ")