本文最后更新于 54 天前，其中的信息可能已经过时。如有错误，请发送邮件到 aing117@163.com。
import os

import requests

# Request URL.
url = 'https://wenku.baidu.com/gsearch/rec/pcviewdocrec2023'
# Request headers: send a desktop Chrome User-Agent
# (presumably so the request looks like it comes from a browser).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}
# Query-string parameters.
data = {
    'sessionId': '2914176871-0122235879--',
    'docId': '54ae6ee1fd0a79563c1e72cd',
    'query': '心电图机十二导联分别接在什么位置',
    'recPositions': 'catalog,toplist'
}
# Send the request. A timeout keeps the script from hanging forever, and
# raise_for_status() stops early on an HTTP error instead of failing later
# while parsing an error page.
response = requests.get(url=url, params=data, headers=headers, timeout=30)
response.raise_for_status()
# Parse the JSON body once and reuse it for both printing and iterating.
result = response.json()
print(result)

# The images are written into ./img, so make sure that directory exists.
os.makedirs('img', exist_ok=True)

# Walk the entries under data -> catalogDoc; each one carries an image URL
# in its 'pic' field. Images are saved as 1.jpg, 2.jpg, ...
for num, index in enumerate(result['data']['catalogDoc'], start=1):
    pic = index['pic']
    # Download and save the image.
    img_response = requests.get(url=pic, headers=headers, timeout=30)
    # Do not write an HTTP error body to disk as if it were a JPEG.
    img_response.raise_for_status()
    img_content = img_response.content
    with open(os.path.join('img', str(num) + '.jpg'), mode='wb') as f:
        f.write(img_content)
    print(pic)
OCR API 接口
# encoding:utf-8
import base64
import os

import requests
from docx import Document

# client_id is the AK (API key) and client_secret is the SK (secret key)
# obtained from the official website.
# NOTE(review): these credentials are hard-coded in source. They should be
# rotated and loaded from an environment variable or a config file that is
# kept out of version control.
host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=YKTOwRiLQsBp7PmZLMSutrw3&client_secret=nbXupaOxl7ByhVz2xa8eVvueZVBqJObe'
response = requests.get(host, timeout=30)
response.raise_for_status()
access_token = response.json()['access_token']

# General text recognition (high-accuracy version).
request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
# Open the image file in binary mode and base64-encode its contents.
# The context manager closes the file handle as soon as it has been read.
with open(os.path.join('img', '1.jpg'), 'rb') as f:
    img = base64.b64encode(f.read())
params = {"image": img}
request_url = request_url + "?access_token=" + access_token
headers = {'content-type': 'application/x-www-form-urlencoded'}
response = requests.post(request_url, data=params, headers=headers, timeout=30)
response.raise_for_status()
result = response.json()
# Without 'words_result' the request did not produce recognised text; show
# the whole payload instead of failing with an opaque KeyError.
if 'words_result' not in result:
    raise RuntimeError('OCR request failed: %r' % (result,))
# One recognised line per entry; join them with newlines.
words = '\n'.join([i['words'] for i in result['words_result']])
# Write the recognised text into a Word document.
doc = Document()
doc.add_paragraph(words)
doc.save('OCR百度云接口测试.docx')
API 接口优化
# encoding:utf-8
import requests
import base64
import os
from docx import Document
# Access token obtained on first use and reused by later get_content() calls,
# so that processing many images does not request a new token for each one.
# NOTE(review): assumes the token stays valid for the duration of one run -
# confirm against the API's token expiry.
_access_token = None


def _get_access_token():
    """Return the OCR service access token, requesting it only once."""
    global _access_token
    if _access_token is None:
        # client_id is the AK (API key) and client_secret is the SK (secret
        # key) obtained from the official website.
        # NOTE(review): these credentials are hard-coded in source. They
        # should be rotated and loaded from an environment variable or a
        # config file that is kept out of version control.
        host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=YKTOwRiLQsBp7PmZLMSutrw3&client_secret=nbXupaOxl7ByhVz2xa8eVvueZVBqJObe'
        response = requests.get(host, timeout=30)
        response.raise_for_status()
        _access_token = response.json()['access_token']
    return _access_token


def get_content(file):
    """Run OCR on one image file and return the recognised text.

    Args:
        file: Path of the image file; it is opened in binary mode.

    Returns:
        The 'words' value of every entry in the response's 'words_result'
        list, joined with newline characters.

    Raises:
        OSError: If the image file cannot be opened or read.
        requests.HTTPError: If either HTTP request returns an error status.
        RuntimeError: If the OCR response contains no 'words_result'.
    """
    # General text recognition (high-accuracy version).
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
    # Open the image file in binary mode and base64-encode its contents.
    # The context manager closes the file handle as soon as it has been read.
    with open(file, 'rb') as f:
        img = base64.b64encode(f.read())
    params = {"image": img}
    request_url = request_url + "?access_token=" + _get_access_token()
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers, timeout=30)
    response.raise_for_status()
    result = response.json()
    # Without 'words_result' the request did not produce recognised text;
    # show the whole payload instead of failing with an opaque KeyError.
    if 'words_result' not in result:
        raise RuntimeError('OCR request failed for %s: %r' % (file, result))
    words = '\n'.join([i['words'] for i in result['words_result']])
    return words
# Directory that holds the images to recognise.
image_dir = 'E:\\pr导出文件\\0830\\'
# os.listdir() returns entries in arbitrary order; sort them so the text in
# the output document has a stable order. The sort is plain lexicographic,
# so '10.jpg' comes before '2.jpg'.
files = sorted(os.listdir(image_dir))
word_list = []
for file in files:
    filename = os.path.join(image_dir, file)
    # Skip sub-directories and other non-file entries, which cannot be
    # opened as images.
    if not os.path.isfile(filename):
        continue
    words = get_content(file=filename)
    word_list.append(words)
# Put the text of all images into a single paragraph, one image after
# another, and save it as a Word document.
doc = Document()
doc.add_paragraph('\n'.join(word_list))
doc.save('0830.docx')