百度文库文档下载
本文最后更新于 54 天前，其中的信息可能已经过时。如有错误，请发送邮件至 aing117@163.com。
import requests


# Local import: only needed to create the output folder further down.
import os

# Request URL
url = 'https://wenku.baidu.com/gsearch/rec/pcviewdocrec2023'
# Request headers: send a desktop-browser User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}
# Query-string parameters
data = {
    'sessionId': '2914176871-0122235879--',
    'docId': '54ae6ee1fd0a79563c1e72cd',
    'query': '心电图机十二导联分别接在什么位置',
    'recPositions': 'catalog,toplist'
}
# Send the request. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops early on an HTTP error
# instead of failing later while decoding an error page as JSON.
response = requests.get(url=url, params=data, headers=headers, timeout=10)
response.raise_for_status()
# Decode the JSON body once and reuse it for the debug print and the loop.
payload = response.json()
print(payload)

# Images are written below the 'img' folder; create it up front so the first
# open() does not fail with FileNotFoundError on a fresh checkout.
os.makedirs('img', exist_ok=True)

# Download the 'pic' URL of every entry under data -> catalogDoc and save the
# files as 1.jpg, 2.jpg, ... in listing order.
for num, entry in enumerate(payload['data']['catalogDoc'], start=1):
    pic = entry['pic']
    # Save the image
    img_response = requests.get(url=pic, headers=headers, timeout=10)
    img_response.raise_for_status()
    # The Windows-style separator is kept as-is so the file name stays
    # identical to what the rest of the file expects ('img\\1.jpg').
    with open('img\\' + str(num) + '.jpg', mode='wb') as f:
        f.write(img_response.content)

    print(pic)

OCR API 接口


# encoding:utf-8
import requests
import base64

from docx import Document
# client_id is the AK and client_secret is the SK obtained from the official site.
# NOTE(review): the AK/SK pair is hard-coded in the URL below; consider loading
# it from the environment or a config file that is not committed/published.
host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=YKTOwRiLQsBp7PmZLMSutrw3&client_secret=nbXupaOxl7ByhVz2xa8eVvueZVBqJObe'
response = requests.get(host)
access_token = response.json()['access_token']

# General text recognition (high-accuracy edition)
request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
# Open the image in binary mode; the with-block closes the handle as soon as
# the bytes have been read (the handle used to stay open for the whole run).
with open('img\\1.jpg', 'rb') as f:
    img = base64.b64encode(f.read())
params = {"image": img}
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers)
result = response.json()
# Same exception type as a plain result['words_result'] lookup, but the message
# now carries the whole response so the reason for the failure is visible.
if 'words_result' not in result:
    raise KeyError('words_result missing from OCR response: {!r}'.format(result))
# One recognised line per entry; join them into a single newline-separated text.
words = '\n'.join(item['words'] for item in result['words_result'])
# Write the recognised text as one paragraph of a new Word document.
doc = Document()
doc.add_paragraph(words)
doc.save('OCR百度云接口测试.docx')

API 接口优化


# encoding:utf-8
import requests
import base64
import os
from docx import Document

# Process-wide cache so the OAuth token is requested once per run instead of
# once per image.
_ACCESS_TOKEN_CACHE = {}


def _get_access_token():
    """Return the access token, requesting it from the OAuth endpoint on first use only."""
    if 'access_token' not in _ACCESS_TOKEN_CACHE:
        # client_id is the AK and client_secret is the SK obtained from the official site.
        # NOTE(review): the AK/SK pair is hard-coded in the URL below; consider
        # loading it from the environment or an unpublished config file.
        host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=YKTOwRiLQsBp7PmZLMSutrw3&client_secret=nbXupaOxl7ByhVz2xa8eVvueZVBqJObe'
        response = requests.get(host)
        # NOTE(review): assumes the token stays valid for the lifetime of the
        # process - confirm against the token's expiry before using this in a
        # long-running service.
        _ACCESS_TOKEN_CACHE['access_token'] = response.json()['access_token']
    return _ACCESS_TOKEN_CACHE['access_token']


def get_content(file):
    """Run one image through the high-accuracy OCR endpoint and return its text.

    Args:
        file: Path of the image file to recognise; it is opened in binary mode.

    Returns:
        The ``words`` value of every entry in the response's ``words_result``
        list, joined with newlines.

    Raises:
        KeyError: If the OCR response has no ``words_result`` key; the message
            contains the full response.
    """
    access_token = _get_access_token()

    # General text recognition (high-accuracy edition)
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
    # Read the image in binary mode; the with-block closes the handle right
    # away, so batch runs over many files do not accumulate open handles.
    with open(file, 'rb') as f:
        img = base64.b64encode(f.read())
    params = {"image": img}
    request_url = request_url + "?access_token=" + access_token
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers)
    result = response.json()
    # Keep KeyError (what a plain lookup would raise) but include the response
    # so the caller can see why recognition failed.
    if 'words_result' not in result:
        raise KeyError('words_result missing from OCR response: {!r}'.format(result))
    words = '\n'.join(item['words'] for item in result['words_result'])
    return words

# Folder whose files are sent through OCR one by one.
folder = 'E:\\pr导出文件\\0830\\'
# os.listdir() returns entries in arbitrary order; sort them so the text is
# assembled in the same order on every run. The sort is lexicographic
# ('10.jpg' comes before '2.jpg'), so zero-pad file names if numeric page
# order matters.
files = sorted(os.listdir(folder))
word_list = []
for file in files:
    filename = os.path.join(folder, file)
    # Skip sub-directories: get_content() opens the path as a file.
    if not os.path.isfile(filename):
        continue
    words = get_content(file=filename)
    word_list.append(words)

# Write the text of all files, separated by newlines, as a single paragraph.
doc = Document()
doc.add_paragraph('\n'.join(word_list))
doc.save('0830.docx')
暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇