Prev: 乘车指南 🚇 Next: Jike Metro 🚇 中各个类的可用属性
import jike
# Single API client shared by every example below.
c = jike.JikeClient()

# Ids of the two accounts whose follower lists are compared.
CEO_USER_ID = '82D23B32-CF36-4C59-AD6F-D05E3552CBF3'   # "Wa Zong", labelled "ceo" in the charts
BOSS_USER_ID = 'B5C00109-15EA-4351-8B93-E58651E8C39D'  # "Bu Guan Jie", labelled "boss" in the charts

ceo_follower = c.get_user_follower(username=CEO_USER_ID)
# Bare expression: shows the collection when run as a notebook cell.
ceo_follower
# NOTE(review): load_all() presumably pages through the complete follower list - confirm.
ceo_follower.load_all()

boss_follower = c.get_user_follower(username=BOSS_USER_ID)
boss_follower.load_all()
from collections import Counter


def _gender_breakdown(followers):
    """Return ``(male, female, unknown)`` follower counts from one pass.

    ``unknown`` counts followers whose ``gender`` is ``None``. Any value other
    than ``'MALE'``, ``'FEMALE'`` or ``None`` is left out of all three counts.
    """
    tally = Counter(follower.gender for follower in followers)
    return tally['MALE'], tally['FEMALE'], tally[None]


ceo_male_fan_count, ceo_female_fan_count, ceo_other_fan_count = _gender_breakdown(ceo_follower)
print(ceo_male_fan_count, ceo_female_fan_count, ceo_other_fan_count)

boss_male_fan_count, boss_female_fan_count, boss_other_fan_count = _gender_breakdown(boss_follower)
print(boss_male_fan_count, boss_female_fan_count, boss_other_fan_count)
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
# Slice labels, in the same order as the count tuples below.
labels = ('male', 'female', 'unknown')
ceo_stats = (ceo_male_fan_count, ceo_female_fan_count, ceo_other_fan_count)
boss_stats = (boss_male_fan_count, boss_female_fan_count, boss_other_fan_count)

# One row, two columns: one pie chart per account.
the_grid = GridSpec(1, 2)


def _draw_gender_pie(column, title, stats):
    """Draw one titled gender pie chart in the given column of ``the_grid``."""
    plt.subplot(the_grid[0, column], aspect=1)
    plt.title(title)
    return plt.pie(stats, labels=labels, autopct='%1.1f%%', shadow=True)


_draw_gender_pie(0, 'ceo', ceo_stats)
_draw_gender_pie(1, 'boss', boss_stats)
# Selected (curated) messages of the "Unfunny Convenience Store" topic.
selected = c.get_topic_selected(topic_id='5701d10d5002b912000e588d')

from datetime import datetime, timedelta, timezone

# NOTE(review): message `createdAt` values are treated as UTC in this notebook,
# so the 30-day window is anchored to a naive UTC "now" instead of local
# wall-clock time; otherwise the cutoff is off by the local UTC offset.
today = datetime.now(timezone.utc).replace(tzinfo=None)
a_month_ago = today - timedelta(days=30)


def date_parse(t):
    """Parse an API timestamp string into a naive ``datetime``.

    The last five characters of ``t`` are dropped and the remainder is parsed
    as ``YYYY-MM-DDTHH:MM:SS``. The dropped suffix is presumably a
    milliseconds/zone marker such as ``.000Z`` - TODO confirm against the API.
    """
    return datetime.strptime(t[:-5], '%Y-%m-%dT%H:%M:%S')


# A comment containing any of these substrings counts as a match.
comment_keywords = {
    '奶', '任务', '爱尔兰', '不动产', '发车', '开车', '上车', '窑子', '黄色', '黄即',
    '片子', '看片', '借一部', '资源', '举报',
}
from collections import defaultdict
# Hour of day of a message's creation time -> number of its comments that
# contain at least one keyword.
time_periods = defaultdict(int)

message_date = today
while message_date > a_month_ago:
    messages = selected.load_more(limit=100)
    if not messages:
        # Nothing more to load: `message_date` would never advance again, so
        # the loop condition alone would spin forever.
        break
    for message in messages:
        message_date = date_parse(message.createdAt)
        if message_date <= a_month_ago:
            # A batch can reach past the cutoff; keep such messages out of the
            # statistics (and skip fetching their comments).
            continue
        comments = c.get_comment(message)
        comments.load_full()
        for comment in comments:
            if any(keyword in comment.content for keyword in comment_keywords):
                time_periods[message_date.hour] += 1
# Counts are keyed by UTC hour; Asia/Shanghai is UTC+8, so local hour H takes
# the count recorded for UTC hour (H - 8) mod 24. The list runs from 0 to 23.
adjusted_time_periods = [
    (local_hour, time_periods[(local_hour - 8) % 24])
    for local_hour in range(24)
]
adjusted_time_periods
店长最可能发车的三个时间段
# Rank the hours by matching-comment count and report the top three together
# with each hour's share of all matches.
total_cnt = sum(cnt for _, cnt in adjusted_time_periods)
drive_time = sorted(adjusted_time_periods, key=lambda t: t[1], reverse=True)[:3]
for period, cnt in drive_time:
    # With no matching comment at all the share is reported as 0% instead of
    # dividing by zero.
    probability = cnt / total_cnt if total_cnt else 0.0
    print('发车时间: {}点,发车概率: {:.2%}'.format(period, probability))
店长发车的时间分布
%matplotlib inline
import matplotlib.pyplot as plt
# One sample per matching comment, valued by its hour of day.
data = [hour for hour, cnt in adjusted_time_periods for _ in range(cnt)]

# Fixed edges 0..24 give every hour its own unit-wide bin. Deriving an integer
# bin count from max(data) - min(data) merged the two latest hours into the
# closing bin and raised when `data` was empty or held a single distinct hour.
hour_edges = list(range(25))
plt.hist(data, bins=hour_edges, facecolor='g', align='left', histtype='bar', rwidth=0.9)
plt.xlabel('Hour of a day')
plt.ylabel('Drive count')
plt.title('Histogram of drive statistics')
plt.grid(True)
plt.show()
即刻Web端首页右侧栏的推荐关注
# Ask the client for the recommended-topic collection (more pages are loaded below).
recommended_topics = c.get_recommended_topic()
加载至少200个推荐的主题
# Keep paging until at least 200 topics are loaded.
while len(recommended_topics) < 200:
    loaded_before = len(recommended_topics)
    recommended_topics.load_more()
    if len(recommended_topics) == loaded_before:
        # The last page added nothing, so fewer than 200 topics are available;
        # stop instead of looping forever.
        break
recommended_topics
给我推荐的前五个主题的关键词
# Print the keywords attribute of the first five recommended topics.
first_recommended = recommended_topics[:5]
for recommended in first_recommended:
    print(recommended.keywords)
我自己关注的主题
# Account whose topic subscriptions are analysed.
MY_USERNAME = 'WalleMax'

my_subscribed_topics = c.get_user_subscribed_topic(username=MY_USERNAME)
# NOTE(review): load_all() presumably fetches every remaining page - confirm.
my_subscribed_topics.load_all()
自己关注的最近五个主题的关键词
# Print the keywords attribute of the first five subscribed topics.
first_subscribed = my_subscribed_topics[:5]
for subscribed in first_subscribed:
    print(subscribed.keywords)
进行关键词计数
from collections import Counter
# keyword -> number of topics mentioning it, one counter per topic collection.
recommended_keywords_counter = Counter()
subscribed_keywords_counter = Counter()

for topic in recommended_topics:
    keywords = topic.keywords
    if not keywords:
        # Topics without keywords contribute nothing.
        continue
    # `keywords` is a whitespace-separated string.
    recommended_keywords_counter.update(keywords.split())
“推荐关注”一共有1164个关键词
# Number of distinct keywords counted across the recommended topics.
len(recommended_keywords_counter)
“推荐关注”中出现频次最高的10个关键词及其对应频数
# The ten most frequent recommended-topic keywords as (keyword, count) pairs.
recommended_keywords_counter.most_common(10)
# Tally the keywords of every subscribed topic.
for topic in my_subscribed_topics:
    keywords = topic.keywords
    if not keywords:
        # Topics without keywords contribute nothing.
        continue
    # `keywords` is a whitespace-separated string.
    subscribed_keywords_counter.update(keywords.split())
“我关注的主题”一共有953个关键词
# Number of distinct keywords counted across the subscribed topics.
len(subscribed_keywords_counter)
“我关注的主题”中出现频次最高的10个关键词及其对应频数
# The ten most frequent subscribed-topic keywords as (keyword, count) pairs.
subscribed_keywords_counter.most_common(10)
好奇,怎么会有 “哄妹子” 呢? 🤔🤔🤔
基于 Word Cloud 生成关键词词云
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from wordcloud import WordCloud
%matplotlib inline
# Load the mask image into an array, then release the file handle right away.
with Image.open('jike.png') as mask_image:
    mask = np.array(mask_image)

# One generator reused for both clouds; the fixed random_state makes the layout
# repeatable. NOTE(review): the font path is a macOS system location - adjust
# it on other platforms.
wc = WordCloud(background_color='white', max_words=100, mask=mask, width=1000, height=800,
               relative_scaling=0.3, random_state=42,
               font_path='/System/Library/Fonts/PingFang.ttc')
the_grid = GridSpec(1, 2)

# Left panel: keywords of the recommended topics.
wc.generate_from_frequencies(recommended_keywords_counter)
plt.subplot(the_grid[0, 0], aspect=1)
plt.title('Recommendation Word Cloud')
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")

# Right panel: keywords of the subscribed topics. The generator is regenerated
# only after the left panel has been drawn.
wc.generate_from_frequencies(subscribed_keywords_counter)
plt.subplot(the_grid[0, 1], aspect=1)
plt.title('Subscription Word Cloud')
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
我觉得吧,“推荐关注”词云里那么大的“鹿晗”,“李易峰”,“综艺”,“明星”和我的个人气质不大符合啊 🤔
即刻的推荐系统还有很大的进步空间
获取 我就想定个位 主题下所有的广场动态
# Square (community post) collection of the topic with this id.
square = c.get_topic_square(topic_id='5aaf50b9127e30001759c57e')
from collections import defaultdict
# City name -> number of posts located in that city.
city_counter = defaultdict(int)
# `location` value of every located post, in the order encountered.
locations = []
记录动态的定位城市和定位经纬度
# Page through the square while the collection still exposes a load-more key,
# recording city and location for every original post that carries a POI.
while square.load_more_key:
    for post in square.load_more():
        if post.type != 'ORIGINAL_POST':
            continue
        poi = post.poi
        if not poi:
            continue
        city_counter[poi['cityname']] += 1
        locations.append(poi['location'])
此主题广场下北京市的动态最多
# (city, count) pair with the highest post count. `default=None` avoids a
# ValueError when no post carried a POI and the counter is empty.
max(city_counter.items(), key=lambda i: i[1], default=None)
一共有406个经纬度信息
# Number of location entries collected from the square posts.
len(locations)
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
墨绿色的标记点即为动态的发布地点,可见:
%matplotlib inline
plt.figure(figsize=(20, 10))

# Mercator map bounded by longitude 70..140 and latitude 15..55.
# Named `poi_map` rather than `map` so the builtin `map` is not shadowed.
poi_map = Basemap(projection='merc', llcrnrlon=70, llcrnrlat=15,
                  urcrnrlon=140, urcrnrlat=55, lat_0=15, lon_0=95, resolution='h')
poi_map.drawcoastlines(linewidth=0.25)
poi_map.drawcountries(linewidth=0.25)
poi_map.fillcontinents(color='coral', lake_color='aqua')
poi_map.drawmapboundary(fill_color='aqua')

for loc in locations:
    # Calling the Basemap instance converts the pair to plot coordinates.
    # NOTE(review): assumes each location is a (longitude, latitude) pair - confirm.
    x, y = poi_map(*loc)
    poi_map.plot(x, y, 'go', markersize=4)

plt.title(' Jike POI Distribution ')
plt.show()
Prev: 乘车指南 🚇 Next: Jike Metro 🚇 中各个类的可用属性