import csv
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs so the Chinese chart labels render

# Listings priced above this value are left out of the analysis.
MAX_PRICE = 20000

# Cleaned listings: one [category, district, price] list per usable CSV row,
# e.g. ['商业类', '呈贡', 9800].
#   category = column 1
#   district = first space-separated token of column 2
#   price    = column 3 parsed as an integer
m_li = []

# newline='' is what the csv module expects, so that it can handle line
# breaks embedded in quoted fields itself.
with open('昆明.csv', encoding="UTF-8", newline='') as f:
    for row in csv.reader(f):
        try:
            price = int(row[3])
            category = row[1]
            district = row[2].split(' ')[0]
        except (ValueError, IndexError):
            # Rows that are too short or whose price column is not an
            # integer (for instance a header line) are skipped.
            continue
        if price <= MAX_PRICE:
            m_li.append([category, district, price])
# Starting centre price of clusters 1-4.
center1, center2, center3, center4 = 7000, 10000, 13000, 16000

# Flag driving the clustering loop; it starts out True.
change = True

# Members currently assigned to clusters 1-4 (four independent lists).
cluster1, cluster2, cluster3, cluster4 = [], [], [], []

# Every value each centre has taken so far, seeded with its starting value.
center1_list, center2_list, center3_list, center4_list = (
    [center1],
    [center2],
    [center3],
    [center4],
)
# ---------------------------- cluster analysis ----------------------------
# Initial assignment: each listing goes to the cluster whose starting centre
# is closest to its price. On a tie the lower-numbered cluster wins, because
# min() returns the first smallest candidate.
start_centers = (center1, center2, center3, center4)
start_clusters = (cluster1, cluster2, cluster3, cluster4)
for listing in m_li:
    listing_price = listing[2]
    nearest = min(
        range(len(start_centers)),
        key=lambda k: abs(listing_price - start_centers[k]),
    )
    start_clusters[nearest].append(listing)
def _mean_price(members, fallback):
    """Return the truncated mean price of *members*.

    Each member is a ``[category, district, price]`` list. An empty cluster
    has no mean, so *fallback* (the centre's previous value) is returned
    instead of dividing by zero.
    """
    if not members:
        return fallback
    return int(sum(entry[2] for entry in members) / len(members))


def _closest_center(price, centers, current):
    """Return the index of the centre in *centers* nearest to *price*.

    The listing's *current* cluster wins ties, so a listing only moves when
    another centre is strictly closer. Among other equally close centres the
    lowest index wins.
    """
    best = current
    for idx, center in enumerate(centers):
        if abs(price - center) < abs(price - centers[best]):
            best = idx
    return best


# Repeat "recompute centres, reassign listings" until a full pass moves nothing.
while change:
    # Move every centre to the mean price of its current members.
    center1 = _mean_price(cluster1, center1)
    center2 = _mean_price(cluster2, center2)
    center3 = _mean_price(cluster3, center3)
    center4 = _mean_price(cluster4, center4)

    # Record the new centres so their evolution can be plotted later.
    center1_list.append(center1)
    center2_list.append(center2)
    center3_list.append(center3)
    center4_list.append(center4)

    centers = [center1, center2, center3, center4]
    clusters = [cluster1, cluster2, cluster3, cluster4]
    staying = [[] for _ in clusters]
    arriving = [[] for _ in clusters]

    change = False  # set back to True as soon as any listing switches cluster
    for current, members in enumerate(clusters):
        for entry in members:
            target = _closest_center(entry[2], centers, current)
            if target == current:
                staying[current].append(entry)
            else:
                change = True
                arriving[target].append(entry)

    # Update the lists in place so cluster1..cluster4 keep naming the live
    # data. Existing members keep their order; newcomers go at the end.
    for members, kept, moved_in in zip(clusters, staying, arriving):
        members[:] = kept + moved_in
# ---------------------- clustering charts ----------------------
# Line chart: how each cluster centre changed over the recorded steps.
history_len = len(center1_list)  # starting value plus one entry per loop pass
steps = list(range(history_len))  # x-axis coordinates
last_step = history_len - 1
center_series = (
    (center1_list, 'r'),
    (center2_list, 'g'),
    (center3_list, 'b'),
    (center4_list, 'k'),
)

for history, colour in center_series:
    plt.plot(steps, history, c=colour)
plt.ylim(bottom=5)
plt.title("中心值变化情况")
plt.xlabel('聚类次数')
plt.ylabel('中心价格')

# Dotted vertical marker at the final step, reaching up to the last centre-4 value.
plt.plot([last_step, last_step], [0, center4_list[-1]], 'k:', linewidth=3)

# Mark each final centre value with a dot ...
for history, colour in center_series:
    plt.scatter(last_step, history[-1], s=20, color=colour)
# ... and label it with its number, placed 1000 above the dot.
for history, _ in center_series:
    final_value = history[-1]
    plt.annotate(
        final_value,
        xy=(last_step, final_value),
        xytext=(last_step, final_value + 1000),
    )
plt.show()
# Bar chart: number of listings in each cluster.
bar_positions = [1, 2, 3, 4]
cluster_sizes = [
    len(members) for members in (cluster1, cluster2, cluster3, cluster4)
]
plt.ylim(top=300)
plt.ylabel("数量")  # y-axis label
plt.xlabel("聚类")
# Write each count just above where its bar will be drawn.
for position, size in zip(bar_positions, cluster_sizes):
    plt.text(position, size + 10, size, ha='center')
plt.bar(bar_positions, cluster_sizes, width=0.6, facecolor='#9999ff')
plt.xticks((1, 2, 3, 4), ('聚类1', '聚类2', '聚类3', '聚类4'))
plt.show()
def sang(cluster):
    """Build scatter-plot coordinates for the listings in *cluster*.

    Each listing is indexed as ``[category, district, price]``. The x value
    is the price. The y value is a random integer drawn from a band chosen
    by category, so the three groups appear as separate horizontal strips:
    '住宅' -> 5..31, '别墅' -> 35..61, anything else -> 65..91 (inclusive).

    Returns a ``(x_values, y_values)`` pair of lists in listing order.
    """
    xs = []
    ys = []
    for listing in cluster:
        kind = listing[0]
        if kind == '住宅':
            low, high = 5, 31
        elif kind == '别墅':
            low, high = 35, 61
        else:
            low, high = 65, 91
        xs.append(listing[2])
        ys.append(random.randint(low, high))
    return xs, ys
# Scatter plot: one colour per cluster, price on the x-axis, category strip on the y-axis.
for members, colour in (
    (cluster1, 'r'),
    (cluster2, 'g'),
    (cluster3, 'b'),
    (cluster4, 'y'),
):
    prices, strips = sang(members)
    plt.scatter(prices, strips, c=colour)
plt.title("昆明市房价分析")  # chart title
plt.yticks(())  # hide the y-axis ticks
plt.show()
# ----------------------- per-cluster category breakdown -----------------------
# Collapse the commercial categories into the single label '商业类'.
# A category counts as commercial when it starts with '商' or '写', or when
# its second character is '商' (e.g. '商业类', '底商', '写字楼').
# Every listing is visited, including the first one of each cluster. The
# slice [1:2] is simply empty for categories shorter than two characters,
# so short or empty names cannot raise IndexError.
for members in (cluster1, cluster2, cluster3, cluster4):
    for entry in members:
        category = entry[0]
        if category.startswith(('商', '写')) or category[1:2] == '商':
            entry[0] = '商业类'
# District names per cluster, split into three buckets by category:
# index 0 = '住宅', index 1 = '别墅', index 2 = everything else
# (shown as the commercial group in the charts and printout).
place1 = [[], [], []]
place2 = [[], [], []]
place3 = [[], [], []]
place4 = [[], [], []]
for members, buckets in (
    (cluster1, place1),
    (cluster2, place2),
    (cluster3, place3),
    (cluster4, place4),
):
    for entry in members:
        if entry[0] == "住宅":
            slot = 0
        elif entry[0] == "别墅":
            slot = 1
        else:
            slot = 2
        buckets[slot].append(entry[1])
# 2x2 grid of bar charts: listings per category bucket, one panel per cluster.
plt.figure(figsize=(12, 12))
panels = (
    (221, place1, "聚类一"),
    (222, place2, "聚类二"),
    (223, place3, "聚类三"),
    (224, place4, "聚类四"),
)
for grid_code, buckets, caption in panels:
    bar_slots = [1, 2, 3]
    counts = [len(buckets[0]), len(buckets[1]), len(buckets[2])]
    plt.subplot(grid_code)
    plt.ylim(top=150)
    plt.ylabel("数量")  # y-axis label
    plt.xlabel(caption)
    # Write each count just above where its bar will be drawn.
    for slot, count in zip(bar_slots, counts):
        plt.text(slot, count + 10, count, ha='center')
    plt.bar(bar_slots, counts, width=0.6, facecolor='#FFD3C7')
    plt.xticks((1, 2, 3), ('基本住宅', '别墅', '商业类'))
plt.show()
# -----------------------------------------------------------
# Print, for every cluster, the distinct districts found in each category
# bucket. dict.fromkeys removes duplicates while keeping first-seen order.
# The loop variable is called `heading` rather than `str`, so the builtin
# str is not rebound at module level.
for heading, place in zip(
    ["\n\n聚类一", "\n\n聚类二", "\n\n聚类三", "\n\n聚类四"],
    [place1, place2, place3, place4],
):
    print(heading)
    print("基本住宅:")
    for district in dict.fromkeys(place[0]):
        print(district, end=" ")
    print("\n别墅:")
    for district in dict.fromkeys(place[1]):
        print(district, end=" ")
    print("\n商业类:")
    for district in dict.fromkeys(place[2]):
        print(district, end=" ")
# Views: 128