导出chrome书签为Markdown

  • 用python解析html,读取h3标签和a标签,写入文件,代码:

import os
from html.parser import HTMLParser

from utils.file import file_utils


class MyHTMLParser(HTMLParser):
    is_a = False
    is_h3 = False
    links = []
    cur_tag_key = ''
    cur_tag_value = ''

    def __init__(self):
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # print "Encountered the beginning of a %s tag" % tag
        if tag == 'a':
            self.is_a = True
            if len(attrs) == 0:
                pass
            else:
                for (variable, value) in attrs:
                    if variable == "href":
                        self.cur_tag_value = value
        else:
            self.is_h3 = True

    def handle_data(self, data):
        if self.is_a:
            self.cur_tag_key = data
        elif self.is_h3:
            self.cur_tag_key = data
            self.cur_tag_value = 'h3'

    def handle_endtag(self, tag):
        if tag == 'a' or tag == 'h3':
            self.is_a = False
            self.is_h3 = False
            if self.cur_tag_key == '' and self.cur_tag_value == '':
                pass
            else:
                self.links.append([self.cur_tag_key, self.cur_tag_value])
                self.cur_tag_key = ''
                self.cur_tag_value = ''


def get_links():
    html_code = file_utils.read2mem('/home/cwh/Mission/bookmarks_16_3_23.html')
    hp = MyHTMLParser()
    hp.feed(html_code)
    hp.close()
    try:
        os.remove('star.md')
    except FileNotFoundError:
        pass
    file_utils.append2file('star.md', '#我的收藏\n>他山之石,可以攻玉\n\n开发过程中收藏在Chrome书签栏里的技术文章,独立出来\n\n转换方式:')
    for word in hp.links:
        if word[1] == 'h3':
            file_utils.append2file('star.md', '##' + word[0] + '\n\n')
            print(word[0] + '\n')
        else:
            file_utils.append2file('star.md', '- [' + word[0] + '](' + word[1] + ')\n\n')
            print('- [' + word[0] + '](' + word[1] + ')\n')


get_links()

其中fileutil是我的一个文件操作模块,可以在这里看到对应的代码,

转换的效果即本博客中的我的收藏部分

最后更新于