Categories: Python爬虫

百度文库文档下载

import requests


# Local import: this script's import section only brings in `requests`.
import os

# Request URL
url = 'https://wenku.baidu.com/gsearch/rec/pcviewdocrec2023'
# Request headers (browser-like User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}
# Query-string parameters
data = {
    'sessionId': '2914176871-0122235879--',
    'docId': '54ae6ee1fd0a79563c1e72cd',
    'query': '心电图机十二导联分别接在什么位置',
    'recPositions': 'catalog,toplist'
}
# Output directory for the downloaded images; create it up front so that
# open() below does not fail when the script runs in a fresh directory.
img_dir = 'img'
os.makedirs(img_dir, exist_ok=True)

# Send the request. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors early.
response = requests.get(url=url, params=data, headers=headers, timeout=10)
response.raise_for_status()
# Parse the JSON body once and reuse it for both printing and iteration.
result = response.json()
print(result)

# Walk the 'catalogDoc' entries and save each entry's 'pic' URL as
# img/<n>.jpg, numbering from 1.
for num, index in enumerate(result['data']['catalogDoc'], start=1):
    pic = index['pic']
    # Download the image bytes
    img_response = requests.get(url=pic, headers=headers, timeout=10)
    img_response.raise_for_status()
    with open(os.path.join(img_dir, str(num) + '.jpg'), mode='wb') as f:
        f.write(img_response.content)

    print(pic)

OCR API 接口


# encoding:utf-8
import requests
import base64

from docx import Document
# client_id is the API Key (AK) and client_secret is the Secret Key (SK)
# obtained from the official site.
# NOTE(review): the credentials are hard-coded in the URL below; consider
# loading them from configuration or the environment instead.
host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=YKTOwRiLQsBp7PmZLMSutrw3&client_secret=nbXupaOxl7ByhVz2xa8eVvueZVBqJObe'
response = requests.get(host, timeout=10)
response.raise_for_status()
token_info = response.json()
# Fail with the full payload instead of a bare KeyError when no token is returned.
if 'access_token' not in token_info:
    raise RuntimeError('Failed to obtain access token: ' + str(token_info))
access_token = token_info['access_token']

# General text recognition (high-accuracy edition)
request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
# Open the image in binary mode; the with-block closes the handle once the
# bytes have been base64-encoded.
with open('img\\1.jpg', 'rb') as f:
    img = base64.b64encode(f.read())
params = {"image": img}
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers, timeout=30)
response.raise_for_status()
result = response.json()
# The code below needs 'words_result'; report the whole response if it is absent.
if 'words_result' not in result:
    raise RuntimeError('OCR request failed: ' + str(result))
# One recognized line per entry; join them with newlines into a single text.
words = '\n'.join(i['words'] for i in result['words_result'])
doc = Document()
doc.add_paragraph(words)
doc.save('OCR百度云接口测试.docx')

API 接口优化


# encoding:utf-8
import requests
import base64
import os
from docx import Document

# Holds the access token after the first successful request so that the
# token endpoint is hit once per process instead of once per image.
_access_token_cache = {}


def _get_access_token():
    """Return the OCR access token, requesting it only on first use.

    NOTE(review): assumes the token stays valid for the lifetime of the
    process -- confirm against the token endpoint's expiry.

    Raises:
        requests.HTTPError: if the token endpoint answers with an HTTP error.
        RuntimeError: if the response carries no 'access_token'.
    """
    if 'access_token' not in _access_token_cache:
        # client_id is the API Key (AK) and client_secret is the Secret Key
        # (SK) obtained from the official site.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration or the environment.
        host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=YKTOwRiLQsBp7PmZLMSutrw3&client_secret=nbXupaOxl7ByhVz2xa8eVvueZVBqJObe'
        response = requests.get(host, timeout=10)
        response.raise_for_status()
        token_info = response.json()
        if 'access_token' not in token_info:
            raise RuntimeError('Failed to obtain access token: ' + str(token_info))
        _access_token_cache['access_token'] = token_info['access_token']
    return _access_token_cache['access_token']


def get_content(file):
    """Run OCR on one image file and return the recognized text.

    The image is base64-encoded and posted to the general text recognition
    (high-accuracy edition) endpoint.

    Args:
        file: Path of the image file to recognize.

    Returns:
        The 'words' values of the response's 'words_result' entries, joined
        with newlines.

    Raises:
        OSError: if the image file cannot be opened.
        requests.HTTPError: if a request answers with an HTTP error status.
        RuntimeError: if the response contains no 'words_result'.
    """
    access_token = _get_access_token()

    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
    # Open the image in binary mode; the with-block closes the handle.
    with open(file, 'rb') as f:
        img = base64.b64encode(f.read())
    params = {"image": img}
    request_url = request_url + "?access_token=" + access_token
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers, timeout=30)
    response.raise_for_status()
    result = response.json()
    # Report the whole payload rather than a bare KeyError on failure.
    if 'words_result' not in result:
        raise RuntimeError('OCR request failed for ' + str(file) + ': ' + str(result))
    return '\n'.join(i['words'] for i in result['words_result'])

# Folder holding the images to run OCR on.
image_dir = 'E:\\pr导出文件\\0830\\'
# os.listdir returns entries in arbitrary order; sort them so the text lands
# in the document in a reproducible order (plain lexicographic sort, so
# '10.jpg' comes before '2.jpg').
files = sorted(os.listdir(image_dir))
word_list = []
for file in files:
    filename = os.path.join(image_dir, file)
    # Skip sub-directories: get_content opens the path as a file.
    if not os.path.isfile(filename):
        continue
    words = get_content(file=filename)
    word_list.append(words)

# Write the text of all images into a single paragraph, one image after another.
doc = Document()
doc.add_paragraph('\n'.join(word_list))
doc.save('0830.docx')

aing117