Jieba文本处理

# encoding=utf-8
import jieba
 
# Enable paddle mode. Supported from jieba 0.40 onwards; earlier versions
# do not provide it.
jieba.enable_paddle()
strs = ["我来到北京清华大学", "乒乓球拍卖完了", "中国科学技术大学"]
for sentence in strs:  # named `sentence` so the built-in `str` is not shadowed
    seg_list = jieba.cut(sentence, use_paddle=True)  # segment in paddle mode
    # str.join consumes the generator directly; no intermediate list is needed.
    print("Paddle Mode: " + '/'.join(seg_list))

# Full mode
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))

# Precise mode
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))

# Precise mode is the default when cut_all is not given
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

# Search-engine mode
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")
print(", ".join(seg_list))

输出：

'''
【全模式】: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
 
【精确模式】: 我/ 来到/ 北京/ 清华大学
 
【新词识别】：他, 来到, 了, 网易, 杭研, 大厦    (此处，“杭研”并没有在词典中，但是也被Viterbi算法识别出来了)
 
【搜索引擎模式】： 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
'''

jieba三种模式

jieba.cut()：返回一个生成器（generator），可以通过 for 循环逐个取出分好的词。
jieba.lcut()：直接返回一个列表（list），内容与 jieba.cut() 的分词结果相同。

import jieba

st = '注意：该代码tf计算使用的是整个语料，这里只是举个简单的例子，大家在写的时候按文档计算词频即可！我这里就不做修改了'

# Precise mode (used when no extra arguments are given)
precise_tokens = jieba.lcut(st)
print(precise_tokens)

# Full mode, selected with cut_all=True
full_tokens = jieba.lcut(st, cut_all=True)
print(full_tokens)

# Search-engine mode
search_tokens = jieba.lcut_for_search(st)
print(search_tokens)

案例：统计出现次数最多的词语

# -*- coding: utf-8 -*-
import jieba

# Read the whole text; the `with` block closes the file handle as soon as
# the content has been loaded.
with open("三国演义.txt", "r", encoding='utf-8') as novel:
    txt = novel.read()
words = jieba.lcut(txt)     # segment the text in precise mode
counts = {}     # maps each word to the number of times it occurs

for word in words:
    if len(word) == 1:    # single-character tokens are not counted
        continue
    counts[word] = counts.get(word, 0) + 1

# (word, count) pairs ordered by count, most frequent first
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Show the three most frequent words; slicing (instead of indexing 0..2)
# avoids an IndexError when the text yields fewer than three distinct words.
for word, count in items[:3]:
    print("{0:<5}{1:>5}".format(word, count))

import re

import requests


class Solution:
    """Count word occurrences in a text file downloaded from a fixed URL."""

    @staticmethod
    def _count_long_words(text: str) -> dict:
        """Build a frequency table of the longer words in ``text``.

        The text is lower-cased and split on whitespace; tokens of three
        characters or fewer are ignored. Punctuation is not stripped, so
        ``"word,"`` and ``"word"`` are counted as different tokens.

        Args:
            text: The raw text to analyse.

        Returns:
            A dict mapping each kept token to its number of occurrences.
        """
        counts = dict()
        for s in text.lower().split():
            if len(s) <= 3:
                continue
            counts[s] = counts.get(s, 0) + 1
        return counts

    def aliceText(self, word: str) -> int:
        """Return how many times ``word`` occurs in the downloaded text.

        The comparison is case-insensitive. As a side effect the full
        frequency list, sorted by descending count, is printed.

        Args:
            word: The word to look up.

        Returns:
            The number of occurrences, or 0 when the word does not occur
            (or is too short to be counted).

        Raises:
            requests.RequestException: If the download fails, times out,
                or the server answers with an error status.
        """
        url = 'http://72.itmc.org.cn:80/JS001/data/user/16937/76/fj_alice_adventure.txt'
        # A timeout keeps the call from hanging forever on an unresponsive
        # server; raise_for_status stops an error page from being counted
        # as if it were the text.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        response.encoding = 'utf-8'
        counts = self._count_long_words(response.text)

        # (word, count) tuples in descending order of count
        items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        print(items)
        # .get avoids a KeyError for words that never appear in the text
        return counts.get(word.lower(), 0)

# Run the lookup for 'Pictures'; the returned count is not used here.
solu = Solution()
solu.aliceText('Pictures')

Jieba实现TF-IDF算法

import jieba.analyse

text = '''关键词是能够表达文档中心内容的词语，常用于计算机系统标引论文内容特征、
信息检索、系统汇集以供读者检阅。关键词提取是文本挖掘领域的一个分支，是文本检索、
文档比较、摘要生成、文档分类和聚类等文本挖掘研究的基础性工作'''

# Extract the five top keywords, without their weights and without any
# part-of-speech filtering.
keywords = jieba.analyse.extract_tags(
    text,
    topK=5,
    withWeight=False,
    allowPOS=(),
)
# Sample output: ['文档', '文本', '关键词', '挖掘', '文本检索']
print(keywords)

用法：jieba.analyse.extract_tags(text, topK=5, withWeight=False, allowPOS=())

text：为待提取的文本

topK：返回权重最大的前 topK 个关键词，默认值为 20

withWeight：是否返回关键字的权重，默认为 False

allowPOS：指定词性，默认为空，即不筛选

Jieba实现Textrank算法

import jieba.analyse

text = '''关键词是能够表达文档中心内容的词语，常用于计算机系统标引论文内容特征、
信息检索、系统汇集以供读者检阅。关键词提取是文本挖掘领域的一个分支，是文本检索、
文档比较、摘要生成、文档分类和聚类等文本挖掘研究的基础性工作'''

# Extract the five top keywords with TextRank, without weights, keeping
# only words whose part-of-speech tag is 'n', 'ns' or 'nr'.
keyword = jieba.analyse.textrank(
    text,
    topK=5,
    withWeight=False,
    allowPOS=('n', 'ns', 'nr'),
)
# Sample output: ['文档', '内容', '文本', '特征', '词语']
print(keyword)

中文文本：《三国演义》人物分析，文本地址：https://python123.io/resources/pye/threekingdoms.txt

'''
中文文本分词,使用字典表达词频
'''
import requests
import jieba

class Solution:
    """Count how often a character name occurs in a text downloaded from a fixed URL."""

    # Alternative tokens that are folded into one canonical character name.
    _ALIASES = {
        '诸葛亮': '孔明',
        '孔明曰': '孔明',
        '关公': '关羽',
        '云长': '关羽',
        '玄德': '刘备',
        '玄德曰': '刘备',
        '孟德': '曹操',
        '丞相': '曹操',
    }

    @classmethod
    def _count_names(cls, words) -> dict:
        """Build a frequency table of tokens with aliases merged.

        Args:
            words: An iterable of already segmented tokens.

        Returns:
            A dict mapping each token (or the canonical name it is an
            alias of) to its number of occurrences. Empty tokens are
            skipped.
        """
        counts = dict()
        for word in words:
            if not word:
                continue
            rword = cls._ALIASES.get(word, word)
            counts[rword] = counts.get(rword, 0) + 1
        return counts

    def fun(self, ren_name: str) -> int:
        """Return the number of times ``ren_name`` is counted in the text.

        The text is downloaded, segmented with jieba, and tallied with
        known aliases merged into their canonical name.

        Args:
            ren_name: The name to look up. An alias is resolved to its
                canonical name first, so the alias and the canonical
                name give the same result.

        Returns:
            The occurrence count, or 0 when the name never appears.

        Raises:
            requests.RequestException: If the download fails, times out,
                or the server answers with an error status.
        """
        url = 'https://python123.io/resources/pye/threekingdoms.txt'
        # A timeout keeps the call from hanging forever on an unresponsive
        # server; raise_for_status stops an error page from being analysed
        # as if it were the text.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        name = self._count_names(jieba.lcut(r.text))
        # Aliases were folded away while counting, so the query must be
        # folded the same way; .get avoids a KeyError for unseen names.
        canonical = self._ALIASES.get(ren_name, ren_name)
        return name.get(canonical, 0)

# Print how many times '曹操' (including the tokens merged into it) is counted.
s = Solution()
print(s.fun('曹操'))