基于Python的中国大学MOOC爬虫练手

Rhilip 2017-01-16 PM 2322℃ 0条

项目地址:Rhilip/icourse163-dl

[gitwidget type='github' url='Rhilip/cn-mooc-dl']

这个爬虫更新到v20170116(93ea55d4d215f675b8e2fae8003e19c03ceed0ad)这个版本,也算是差不多完成了吧。今后可能就不做更新了。

最早开始抓MOOC的时候,用的是插件的方法读视频的地址,如果没有记错的话,应该是Flash Video Downloader这个插件吧。当视频链接被读到后,就能被插件嗅探出来。早期发在byrbt的MOOC课程都是使用这种方法(因为那时候是为了自己平时使用方便临时抓的),按编号来说应该都在20之前滴说。

顺带那时候还没有发布MOOC的任何计划,就连教育网PT站点是什么都不知道QAQ。。。

然后很长时间我是用F12来读视频地址的。相比早期的插件法,可以很好的防止网易的垃圾服务器长时间无响应(也不知道是学校垃圾校园网的问题还是网易的问题。。。)。F12打开Chrome的开发者工具,在Network面板中用Filter过滤出来mp4。这种方法我大概抓了20-40门课。那时候感觉这真的是一件体力活。。。。

然后我开始动手写这个专门拿来抓中国大学MOOC下载链接的脚本了。。

不过自从大一学完了C语言后,基本就没写过程序(毕竟不是计算机专业2333)。还是遇到了很多的问题。比如如何解决登陆验证(后来用Session和Cookies绕过,带来的麻烦就是每隔一段时间就要更新)、比如如何清洗dwr交互文件的信息(后来直接用re来洗了)以及如何处理下载课程文档和字幕(脚本开源在github后多数commit都是和这个有关)等。

脚本的初稿(差不多就是611615d这个样子)大概写了半天多吧,得益于我看文档理解运用的能力。。(一个没有用过Python来编程的孩子心里痛QAQ)

可以看出来,文件还是有C的影子(哪怕现在也是2333)。

另外,抓了这么多中国大学MOOC。不得不吐槽部分学校发布的课程简直不能看,命名还统一都是“课程视频”,真是呵呵了。

关于脚本,如果真要说的话,应该还没有完全完成吧。关于抓取说明和课程介绍的txt文件、课程的封面图和介绍视频等(写了一些后来弃坑了,引入bs4也就是为了这个);抓取课程的时候统一只抓视频和文档,没有对章节进行处理(见下面的示例代码),造成了后期整理课程视频时候的麻烦。待他人fork后跟进吧,或者什么时候我又提起修改代码的兴趣(16年末一堆课程完结真是抓吐了我滴说)

s19.chapterId=1002140025;s19.contentId=null;s19.contentType=1;s19.gmtCreate=1476768213802;s19.gmtModified=1476768213802;s19.id=1002445198;s19.isTestChecked=false;s19.name="1-2 \u8BA1\u7B97\u673A\u786C\u4EF6\u7CFB\u7EDF";s19.position=1;s19.releaseTime=1476768600000;s19.termId=1001877005;s19.test=null;s19.testDraftStatus=0;s19.units=s28;s19.viewStatus=0;

更多的是v20170116的脚本备份,建议访问github查看~

# -*- coding: utf-8 -*-
import requests, random, re, os
from bs4 import BeautifulSoup
from http.cookies import SimpleCookie
from urllib.parse import unquote

# -*- Config
# Fill in the three empty values below before running the script.
# URL of the course page (may carry the "tid" query key to pick a term)
course_url = ''
# Value sent as "httpSessionId" in every DWR request
httpSessionId = ''
# Raw "Cookie" header string copied from a logged-in browser session
raw_cookies = ''
# Headers sent with every DWR POST request (do not change)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Content-Type': 'text/plain',
}

# Whether Chinese / English subtitles are downloaded (True or False)
downloadSrt = True
# Video link keys in order of preference; the first one found is used
downloadVideoType = [
    'mp4ShdUrl',
    'mp4HdUrl',
    'mp4SdUrl',
    'flvShdUrl',
    'flvHdUrl',
    'flvSdUrl',
]

# -*- Api
# Turn the raw cookie string into the plain name -> value dict requests expects
cookie = SimpleCookie(raw_cookies)
cookies = {name: item.value for name, item in cookie.items()}


# getLessonUnitLearnVo (This function will return a dict with download info)
def getLessonUnitLearnVo(contentId, id, contentType):
    """Request one lesson unit from the DWR endpoint and extract its download info.

    Args:
        contentId: Value sent as ``c0-param0``.
        id: Value sent as ``c0-param3``. The name shadows the builtin but is
            kept so existing callers keep working.
        contentType: Integer unit type, sent as ``c0-param1``. Type 1 is
            parsed as a video and type 3 as a document; any other type
            yields an empty dict.

    Returns:
        dict: For videos, ``videoType`` (list of the keys from
        ``downloadVideoType`` that were found, in preference order), one
        entry per found key holding its URL, and optionally ``videoImgUrl``,
        ``ChsSrt`` and ``EngSrt``. For documents, ``textOrigUrl`` when the
        response contains it.
    """
    # prepare data and post
    payload = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
        'httpSessionId': httpSessionId,
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLessonUnitLearnVo',
        'c0-id': 1,
        'c0-param0': contentId,
        'c0-param1': contentType,
        'c0-param2': 0,
        'c0-param3': id,
        'batchId': random.randint(1000000000000, 20000000000000)
    }
    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'

    rdata = requests.post(cs_url, data=payload, headers=headers, cookies=cookies, timeout=None).text
    return _parse_lesson_unit(rdata, contentType)


def _parse_lesson_unit(rdata, contentType):
    """Extract the download links for one unit from the raw DWR response text."""
    info = {}
    if contentType == 1:  # Video
        # The thumbnail is informational only, so its absence must not abort parsing.
        thumbnail = re.search(r's\d+\.videoImgUrl="(.+?)";', rdata)
        if thumbnail:
            info['videoImgUrl'] = thumbnail.group(1)

        video_type = []  # keys of downloadVideoType that carry a usable link
        for k in downloadVideoType:
            # Accept both containers: the flv keys point at ".flv" files, so
            # requiring ".mp4" would find nothing for them. [^"] keeps the
            # match inside this one quoted value, and anything after the
            # extension (e.g. a query string) is dropped.
            found = re.search(
                r's\d+\.' + re.escape(str(k)) + r'="([^"]+?\.(?:mp4|flv))[^"]*";', rdata)
            if found:
                info[k] = found.group(1)
                video_type.append(k)
        # type of resulting video
        info["videoType"] = video_type

        # Subtitle: the response carries the track name as escaped unicode
        # ("\u4E2D\u6587" = Chinese, "\u82F1\u6587" = English).
        for key, label in (('ChsSrt', r'\\u4E2D\\u6587'), ('EngSrt', r'\\u82F1\\u6587')):
            subtitle = re.search(r's\d+\.name="' + label + r'";s\d+\.url="(.+?)"', rdata)
            if subtitle:
                info[key] = subtitle.group(1)

    # contentType 2 (Test), 4 (Rich text), 5 (Examination) and 6 (Discussion)
    # are not handled and produce an empty dict.
    if contentType == 3:  # Documentation
        document = re.search(r'textOrigUrl:"(.+?)"', rdata)
        if document:
            info['textOrigUrl'] = document.group(1)

    return info


# Structure lesson (This function will return a dict with lesson info)
def sort_lesson(index):
    """Parse one lesson-unit line of the DWR response into a lesson dict.

    Args:
        index: A single response line containing the ``contentType``,
            ``name``, ``contentId`` and ``id`` assignments of one unit.

    Returns:
        dict: ``contentType`` (int), ``name`` (unit title cleaned for use as
        a file name) and ``info`` (the result of ``getLessonUnitLearnVo``).

    Raises:
        AttributeError: If one of the four fields is missing from the line.
    """
    content_type = int(re.search(r'\.contentType=(\d+);', index).group(1))

    raw_name = re.search(r'\.name="(.+)";', index).group(1)
    name = (
        raw_name
        .replace(r'\n', '')                               # drop escaped line breaks
        .encode('utf-8').decode('unicode_escape')         # "\uXXXX" -> real characters
        .encode('gbk', 'ignore').decode('gbk', 'ignore')  # drop what GBK cannot represent
        .replace('/', '_')
        # ':' is not allowed in Windows file names; use the full-width colon
        # (U+FF1A) instead. Written as an escape so the two colons cannot be
        # confused or normalised into each other.
        .replace(':', '\uff1a')
        .replace('"', '')
    )

    content_id = re.search(r'\.contentId=(\d+);', index).group(1)
    unit_id = re.search(r'\.id=(\d+);', index).group(1)
    return dict(
        contentType=content_type,
        name=name,
        info=getLessonUnitLearnVo(content_id, unit_id, content_type),
    )


# Download things
def downloadCourseware(path, link, filename):
    """Download ``link`` and store it as ``filename`` inside directory ``path``.

    Args:
        path: Target directory; created when it does not exist yet.
        link: URL to fetch.
        filename: Name of the file written inside ``path``.

    The result is reported on stdout. When the server answers with an HTTP
    error status nothing is written, so an error page is never saved under
    the name of the real file.
    """
    os.makedirs(path, exist_ok=True)
    r = requests.get(link)
    if not r.ok:
        print("Download \"" + filename + "\" Failed! (HTTP " + str(r.status_code) + ")")
        return
    # os.path.join keeps the file inside ``path`` on every platform; a
    # hard-coded backslash is only a separator on Windows.
    with open(os.path.join(path, filename), "wb") as code:
        code.write(r.content)
    print("Download \"" + filename + "\" OK!")


# -*- End of Api

# -*- Main
def main():
    """Crawl the configured course and collect its videos, subtitles and documents.

    Uses the module-level configuration (``course_url``, ``httpSessionId``,
    ``headers``, ``cookies``, ``downloadSrt``). A directory named
    ``<course>-<school>-<teachers>`` is created in the working directory.
    For every video unit the download link is appended to ``dllink.txt`` and
    a ``ren`` command to ``ren.bat`` in that directory; subtitles go to the
    ``srt`` sub-directory and documents to ``docs``.
    """
    # handle the course_url links to Get right courseId and termId
    course_match = re.search(r'([A-Za-z]*-\d*)', course_url)
    if not course_match:
        print("No course Id,Please check!")
        return
    courseId = course_match.group(1)
    bs = BeautifulSoup(requests.get(url="http://www.icourse163.org/course/" + courseId + "#/info", timeout=None).text, "lxml")

    tid_match = re.search(r'tid=(\d+)', course_url)
    if tid_match:
        tid = tid_match.group(1)
    else:
        print("No termId which you want to download.Will Choose the Lastest term.")
        # The page script is only needed for this fallback, so it is parsed lazily.
        course_info_raw = bs.find("script", text=re.compile(r"termDto")).string
        tid = re.search(r"termId : \"(\d+)\"", course_info_raw).group(1)

    print('Begin~')
    # Generate Grab information
    title_match = re.search(r'(.+?)_(.+?)_(.+?)', bs.title.string)
    course_name = title_match.group(1)
    school_name = title_match.group(2)
    teacher_name = []
    for i in bs.find_all('h3', class_="f-fc3"):
        teacher_name.append(i.string)
        if len(teacher_name) >= 3:
            # Keep at most three names and mark the list as shortened.
            teacher_name[2] += '等'
            break
    teacher_name = '、'.join(teacher_name)
    path = course_name + '-' + school_name + '-' + teacher_name
    print("The Download INFO:\nCourse:" + path + "\nid: " + courseId + "\ntermID:" + tid)

    # Make course's dir
    os.makedirs(path, exist_ok=True)

    # Get course's chapter
    payload = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
        'httpSessionId': httpSessionId,
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-id': 0,
        'c0-param0': tid,
        'batchId': random.randint(1000000000000, 20000000000000)
    }
    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    rdata = requests.post(cs_url, data=payload, headers=headers, cookies=cookies, timeout=None).text

    if not re.search(r'var s\d+=\{\}', rdata):
        # No object declarations in the reply: report the server message when there is one.
        error = re.search(r'message:(.+)\}\)', rdata)
        print("Error:" + (error.group(1) if error else "unexpected response") +
              ",Please make sure you login by 163-email and your \"Session-Cookies\" pair is right.")
        return

    video_count = 0
    doc_count = 0
    # Data cleaning: only lines that start with an "anchorQuestions" assignment are lesson units
    for index in rdata.splitlines():
        if not re.match(r's(\d+).anchorQuestions=', index):
            continue
        # Structure lesson
        lesson = sort_lesson(index)
        lessontype = lesson['contentType']
        if lessontype == 1:  # Video
            if _record_video(lesson, path):
                video_count += 1
        elif lessontype == 3:  # Documentation
            if _download_document(lesson, path, doc_count):
                doc_count += 1
    print("Found {0} Video(es),and {1} Text(s) on this page".format(video_count, doc_count))


def _record_video(lesson, path):
    """Append one video's link and rename command, fetch its subtitles; return True if recorded."""
    info = lesson['info']
    name = str(lesson.get('name'))
    bestvideo = info.get('videoType')  # Choose download video Type
    if not bestvideo:
        # None of the configured link keys was present for this unit.
        print("No video link found for \"" + name + "\",skipped.")
        return False

    # Output video download link
    dllink = info.get(bestvideo[0])
    with open(os.path.join(path, "dllink.txt"), "a") as link_file:
        link_file.write(dllink + "\n")

    # Output video rename command
    dlfile = re.search(r'/(\d+?_.+?\.(mp4|flv))', dllink).group(1)
    videotype = re.search(r'^(flv|mp4)(Sd|Hd|Shd)Url', str(bestvideo[0]))
    extension = str(videotype.group(1))
    quality = str(videotype.group(2))
    if quality == "Shd":
        new = "ren " + dlfile + " \"" + name + "." + extension + "\"\n"
    else:
        # Lower qualities carry the quality tag in the final file name.
        new = "ren " + dlfile + " \"" + name + "_" + quality + "." + extension + "\"\n"
    print("Find Video\n" + name + " : " + dllink)
    with open(os.path.join(path, "ren.bat"), "a") as rename_file:
        rename_file.write(new)

    # Subtitle
    if downloadSrt:
        for key, language, suffix in (('ChsSrt', 'Chinese', '.chs.srt'),
                                      ('EngSrt', 'English', '.eng.srt')):
            if info.get(key):
                print("Find " + language + " Subtitle for this lesson,Begin download.")
                downloadCourseware(path=os.path.join(path, "srt"),
                                   link=str(info.get(key)),
                                   filename=name + suffix)
    return True


def _download_document(lesson, path, number):
    """Download one document unit, prefixing its name with ``number``; return True if attempted."""
    wdlink = lesson['info'].get('textOrigUrl')
    if not wdlink:
        print("No document link found for \"" + str(lesson.get('name')) + "\",skipped.")
        return False
    print("Find Document,Begin download.")
    named = re.search(r'&download=(.+)', wdlink)
    if named:
        docname = unquote(named.group(1)).replace("+", " ")
    else:
        # No "download" parameter: fall back to the last path segment of the URL.
        docname = unquote(wdlink.split('?', 1)[0].rsplit('/', 1)[-1])
    downloadCourseware(path=os.path.join(path, "docs"),
                       link=wdlink,
                       filename=str(number) + " " + docname)
    return True


if __name__ == '__main__':
    main()
标签: Python, MOOC, icourse163-dl

非特殊说明,本博所有文章均为博主原创。

评论啦~