项目地址:Rhilip/icourse163-dl

这个爬虫更新到v20170116(93ea55d4d215f675b8e2fae8003e19c03ceed0ad)这个版本,也算是差不多完成了吧。今后可能就不做更新了。

最早开始抓MOOC的时候,用的是插件的方法读视频的地址,如果没有记错的话,应该是Flash Video Downloader这个插件吧。当视频链接被读到后,就能被插件嗅探出来。早期发在byrbt的MOOC课程都是使用这种方法(因为那时候是为了自己平时使用方便临时抓的),按编号来说应该都在20之前滴说。

顺带那时候还没有发布MOOC的任何计划,就连教育网PT站点是什么都不知道QAQ。。。

然后很长时间我是用F12来读视频地址的。相比早期的插件法,可以很好地防止网易的垃圾服务器长时间无响应(也不知道是学校垃圾校园网的问题还是网易的问题。。。)。F12打开Chrome的开发者工具,在Network面板中用Filter过滤出来mp4。这种方法我大概抓了20-40门课。那时候感觉这真的是一件体力活。。。。

然后我开始动手写这个专门拿来抓中国大学MOOC下载链接的脚本了。。

不过自从大一学完了C语言后,基本就没写过程序(毕竟不是计算机专业2333)。还是遇到了很多的问题。比如如何解决登录验证(后来用Session和Cookies绕过,带来的麻烦就是每隔一段时间就要更新)、比如如何清洗dwr交互文件的信息(后来直接用re来洗了)以及如何处理下载课程文档和字幕(脚本开源在github后多数commit都是和这个有关)等。

脚本的初稿(差不多就是 611615d 这个样子)大概写了半天多吧,得益于我看文档理解运用的能力。。(一个没有用过Python来编程的孩子心里痛QAQ

可以看出来,文件还是有C的影子(哪怕现在也是2333)。

另外,抓了这么多中国大学MOOC。不得不吐槽部分学校发布的课程简直不能看,命名还统一都是“课程视频”,真是呵呵了。

关于脚本,如果真要说的话,应该还没有完全完成吧。关于抓取说明和课程介绍的txt文件、课程的封面图和介绍视频等(写了一些后来弃坑了,引入bs4也就是为了这个);抓取课程的时候统一只抓视频和文档,没有对章节进行处理(见下面的示例代码),造成了后期整理课程视频时候的麻烦。待他人fork后跟进吧,或者什么时候我又提起修改代码的兴趣(16年末一堆课程完结真是抓吐了我滴说)

1
s19.chapterId=1002140025;s19.contentId=null;s19.contentType=1;s19.gmtCreate=1476768213802;s19.gmtModified=1476768213802;s19.id=1002445198;s19.isTestChecked=false;s19.name="1-2 \u8BA1\u7B97\u673A\u786C\u4EF6\u7CFB\u7EDF";s19.position=1;s19.releaseTime=1476768600000;s19.termId=1001877005;s19.test=null;s19.testDraftStatus=0;s19.units=s28;s19.viewStatus=0;

更多的是v20170116的脚本备份,建议访问github查看~

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# -*- coding: utf-8 -*-
import requests, random, re, os
from bs4 import BeautifulSoup
from http.cookies import SimpleCookie
from urllib.parse import unquote

# -*- Config
# Warning: fill in the three empty values below before running the script.
# Course url (with key "tid")
course_url = ''
# Session
httpSessionId = ''
# cookies (raw "Cookie:" header value copied from a logged-in browser session)
raw_cookies = ''
# Post Header(Don't change)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Content-Type': 'text/plain',
}

downloadSrt = True  # Download Chinese or English Srt (True or False)
# Video link keys in order of preference; the first one present in the
# server answer is the one whose link gets recorded.
downloadVideoType = [
    'mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl',
    'flvShdUrl', 'flvHdUrl', 'flvSdUrl',
]

# -*- Api
# Arrange Cookies from raw: parse the raw header string into the plain
# name -> value dict that is passed as ``cookies=`` to requests.
cookie = SimpleCookie()
cookie.load(raw_cookies)
cookies = {key: morsel.value for key, morsel in cookie.items()}


# getLessonUnitLearnVo (This function will return a dict with download info)
def getLessonUnitLearnVo(contentId, id, contentType):
    """Ask the ``CourseBean.getLessonUnitLearnVo`` DWR endpoint about one unit.

    The answer is plain text; the interesting values are pulled out of it
    with regular expressions.

    :param contentId: sent as ``c0-param0``.
    :param id: sent as ``c0-param3``.
    :param contentType: sent as ``c0-param1``; also selects what is parsed
        (1 = video, 3 = document, anything else yields an empty dict).
    :return: dict of download info.  For videos: optional ``videoImgUrl``,
        one entry per available key of ``downloadVideoType``, the list of
        those keys under ``videoType`` (possibly empty), and optional
        ``ChsSrt`` / ``EngSrt`` subtitle links.  For documents: optional
        ``textOrigUrl``.  A value missing from the answer is left out of
        the dict instead of raising.
    """
    # prepare data and post
    payload = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
        'httpSessionId': httpSessionId,
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLessonUnitLearnVo',
        'c0-id': 1,
        'c0-param0': contentId,
        'c0-param1': contentType,
        'c0-param2': 0,
        'c0-param3': id,
        'batchId': random.randint(1000000000000, 20000000000000)
    }
    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'

    rdata = requests.post(cs_url, data=payload, headers=headers, cookies=cookies, timeout=None).text

    info = {}
    # Sort data depending on its contentType into dict info
    if contentType == 1:  # Video
        cover = re.search(r's\d+\.videoImgUrl="(.+?)";', rdata)
        if cover:
            info['videoImgUrl'] = cover.group(1)

        video_type = []  # keys of the video links that are really available
        for k in downloadVideoType:
            # Keep the link up to its file extension and drop whatever
            # follows inside the quotes.  Both containers are accepted so
            # the flv* keys work as well as the mp4* ones, and [^"] keeps
            # the match inside this single quoted value.
            found = re.search(
                r's\d+\.' + re.escape(str(k)) + r'="([^"]+?\.(?:mp4|flv))[^"]*";',
                rdata)
            if found:
                info[k] = found.group(1)
                video_type.append(k)
        # type of resulting video
        info["videoType"] = video_type

        # Subtitle: the track name arrives as a literal \uXXXX escape sequence
        subtitle_tracks = (
            ('ChsSrt', r'\\u4E2D\\u6587'),  # Chinese
            ('EngSrt', r'\\u82F1\\u6587'),  # English
        )
        for key, escaped_name in subtitle_tracks:
            track = re.search(r's\d+\.name="' + escaped_name + r'";s\d+\.url="(.+?)"', rdata)
            if track:
                info[key] = track.group(1)

    # contentType 2 (Test), 4 (Rich text), 5 (Examination) and
    # 6 (Discussion) are not handled.
    if contentType == 3:  # Documentation
        document = re.search(r'textOrigUrl:"(.+?)"', rdata)
        if document:
            info['textOrigUrl'] = document.group(1)

    return info


# Structure lesson (This function will return a dict with lesson info)
def sort_lesson(index):
    """Turn one lesson-unit line of the DWR answer into a dict.

    :param index: a single text line holding ``sN.field=value;`` pairs.
    :return: dict with ``contentType`` (int), ``name`` (display name made
        usable as a file name) and ``info`` (the result of
        ``getLessonUnitLearnVo``, or ``{}`` when the line carries no
        numeric ``contentId``/``id`` to query with).
    :raises ValueError: if the line has no ``contentType`` or ``name``.
    """
    type_match = re.search(r'\.contentType=(\d+);', index)
    name_match = re.search(r'\.name="(.+)";', index)
    if not type_match or not name_match:
        raise ValueError("Unrecognized lesson line: " + index[:80])
    content_type = int(type_match.group(1))

    name = (
        name_match.group(1)
        .replace(r'\n', '')
        # the server sends non-ASCII text as \uXXXX escapes
        .encode('utf-8').decode('unicode_escape')
        # drop everything that cannot be represented in GBK
        .encode('gbk', 'ignore').decode('gbk', 'ignore')
        # '/' and ':' are not allowed in Windows file names; the colon is
        # swapped for its full-width look-alike
        .replace('/', '_').replace(':', '\uff1a').replace('"', '')
    )

    content_id = re.search(r'\.contentId=(\d+);', index)
    unit_id = re.search(r'\.id=(\d+);', index)
    if content_id and unit_id:
        info = getLessonUnitLearnVo(content_id.group(1), unit_id.group(1), content_type)
    else:
        # e.g. contentId=null: there is nothing to ask the server about
        info = {}

    return dict(contentType=content_type, name=name, info=info)


# Download things
def downloadCourseware(path, link, filename):
    """Fetch *link* and save the body as *filename* inside directory *path*.

    :param path: target directory; created (with parents) when missing.
    :param link: URL to download.
    :param filename: name of the file written below *path*.

    Nothing is written when the server answers with an HTTP error status,
    so an error page never ends up stored as a course file; a message is
    printed either way.
    """
    os.makedirs(path, exist_ok=True)
    r = requests.get(link)
    if not r.ok:
        print("Download \"" + filename + "\" failed (HTTP " + str(r.status_code) + ")")
        return
    with open(os.path.join(path, filename), "wb") as code:
        code.write(r.content)
    print("Download \"" + filename + "\" OK!")


# -*- End of Api

# -*- Main
def _append_text(file_path, text):
    """Append *text* to *file_path* and close the file right away."""
    with open(file_path, "a") as handle:
        handle.write(text)


def _record_video(path, name, info):
    """Write the preferred video link and its rename command below *path*.

    Returns True when a link was recorded, False when *info* offers none.
    """
    available = info.get('videoType') or []
    dllink = info.get(available[0]) if available else None
    if not dllink:
        print("No video download link for \"" + name + "\",skipped.")
        return False
    best = str(available[0])

    # Output video download link
    _append_text(os.path.join(path, "dllink.txt"), dllink + "\n")
    print("Find Video\n" + name + " : " + dllink)

    # Output video rename command (downloaded file name -> lesson name)
    file_match = re.search(r'/(\d+?_.+?\.(mp4|flv))', dllink)
    type_match = re.search(r'^(flv|mp4)(Sd|Hd|Shd)Url', best)
    if file_match and type_match:
        container, quality = type_match.group(1), type_match.group(2)
        if quality == "Shd":
            target = name + "." + container
        else:
            # anything below Shd carries its quality tag in the name
            target = name + "_" + quality + "." + container
        _append_text(os.path.join(path, "ren.bat"),
                     "ren " + file_match.group(1) + " \"" + target + "\"\n")
    else:
        print("Can not build the rename command for \"" + name + "\".")
    return True


def main():
    """Collect the video links, subtitles and documents of one course term.

    Reads the module-level ``course_url``, ``httpSessionId``, ``headers``,
    ``cookies`` and ``downloadSrt`` settings.  Inside a directory named
    ``<course>-<school>-<teachers>`` it appends video links to
    ``dllink.txt`` and rename commands to ``ren.bat``, and downloads
    subtitles into ``srt`` and documents into ``docs``.  Problems are
    reported with a printed message followed by an early return.
    """
    # handle the course_url links to get the right courseId and termId
    course_match = re.search(r'([A-Za-z]*-\d*)', course_url)
    if not course_match:
        print("No course Id,Please check!")
        return
    courseId = course_match.group(1)
    bs = BeautifulSoup(requests.get(url="http://www.icourse163.org/course/" + courseId + "#/info", timeout=None).text, "lxml")

    tid_match = re.search(r'tid=(\d+)', course_url)
    if tid_match:
        tid = tid_match.group(1)
    else:
        print("No termId which you want to download.Will Choose the Lastest term.")
        # The course page is only searched for a term id when the url has none.
        script = bs.find("script", text=re.compile(r"termDto"))
        script_text = script.string if script is not None else None
        term_match = re.search(r"termId : \"(\d+)\"", script_text or '')
        if not term_match:
            print("Error:No termId found on the course page,Please add \"tid\" to course_url.")
            return
        tid = term_match.group(1)

    print('Begin~')
    # Generate Grab information
    title_text = bs.title.string if bs.title is not None else None
    title_match = re.search(r'(.+?)_(.+?)_(.+?)', title_text or '')
    if not title_match:
        print("Error:Can not read course and school name from the course page.")
        return
    course_name = title_match.group(1)
    school_name = title_match.group(2)
    teachers = []
    for tag in bs.find_all('h3', class_="f-fc3"):
        if tag.string is None:  # heading without plain text: nothing to show
            continue
        teachers.append(tag.string)
        if len(teachers) >= 3:
            # list at most three teachers and mark the list as shortened
            teachers[2] += '等'
            break
    teacher_name = '、'.join(teachers)
    path = course_name + '-' + school_name + '-' + teacher_name
    print("The Download INFO:\nCourse:" + path + "\nid: " + courseId + "\ntermID:" + tid)

    # Make course's dir
    os.makedirs(path, exist_ok=True)

    # Get course's chapter
    payload = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
        'httpSessionId': httpSessionId,
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-id': 0,
        'c0-param0': tid,
        'batchId': random.randint(1000000000000, 20000000000000)
    }
    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    rdata = requests.post(cs_url, data=payload, headers=headers, cookies=cookies, timeout=None).text

    if not re.search(r'var s\d+=\{\}', rdata):
        reason = re.search(r'message:(.+)\}\)', rdata)
        print("Error:" + (reason.group(1) if reason else "unexpected response") +
              ",Please make sure you login by 163-email and your \"Session-Cookies\" pair is right.")
        return

    video_count = 0
    doc_count = 0
    # Data cleaning: only lines starting with an anchorQuestions field are
    # handed to sort_lesson
    for line in rdata.splitlines():
        if not re.match(r's(\d+).anchorQuestions=', line):
            continue
        # Structure lesson
        lesson = sort_lesson(line)
        lessontype = lesson['contentType']
        info = lesson['info']
        name = str(lesson.get('name'))

        if lessontype == 1:  # Video
            if _record_video(path, name, info):
                video_count += 1
            # Subtitle
            if downloadSrt:
                subtitle_kinds = (
                    ('ChsSrt', 'Chinese', '.chs.srt'),
                    ('EngSrt', 'English', '.eng.srt'),
                )
                for key, language, suffix in subtitle_kinds:
                    srt_link = info.get(key)
                    if srt_link:
                        print("Find " + language + " Subtitle for this lesson,Begin download.")
                        downloadCourseware(path=os.path.join(path, "srt"),
                                           link=str(srt_link),
                                           filename=name + suffix)

        if lessontype == 3:  # Documentation
            wdlink = info.get('textOrigUrl')
            if not wdlink:
                print("No document download link for \"" + name + "\",skipped.")
                continue
            print("Find Document,Begin download.")
            named = re.search(r'&download=(.+)', wdlink)
            if named:
                doc_name = unquote(named.group(1)).replace("+", " ")
            else:
                # no download parameter: fall back to the last url segment
                doc_name = unquote(wdlink.rsplit('/', 1)[-1].split('?', 1)[0])
            downloadCourseware(path=os.path.join(path, "docs"),
                               link=wdlink,
                               filename=str(doc_count) + " " + doc_name)
            doc_count += 1

    print("Found {0} Video(es),and {1} Text(s) on this page".format(video_count, doc_count))


if __name__ == '__main__':
    main()