import json
import nltk
import os
import pprint
import random
import simpleneighbors
import urllib
from IPython.display import HTML, display
from tqdm.notebook import tqdm
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer

# Fetch the Punkt sentence-tokenizer data; extract_sentences_from_squad_json
# calls nltk.tokenize.sent_tokenize, which relies on it.
nltk.download('punkt')
def download_squad(url):
    """Download a JSON document from ``url`` and return it parsed.

    Args:
        url: Location of the JSON resource; anything accepted by
            ``urllib.request.urlopen``.

    Returns:
        The decoded JSON value. The other helpers in this file expect a
        dict with a top-level ``'data'`` list.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package having
    # imported it as a side effect.
    import urllib.request

    # The context manager closes the response even if parsing fails.
    with urllib.request.urlopen(url) as response:
        return json.load(response)
def extract_sentences_from_squad_json(squad):
    """Collect the unique (sentence, context) pairs of a SQuAD document.

    Every paragraph's ``'context'`` string is split with
    ``nltk.tokenize.sent_tokenize`` and each resulting sentence is paired
    with the full context it came from.

    Args:
        squad: Parsed SQuAD JSON; read as
            ``squad['data'][i]['paragraphs'][j]['context']``.

    Returns:
        A list of ``(sentence, context)`` tuples with duplicates removed.
        The order is unspecified because the pairs pass through a set.
    """
    unique_pairs = set()
    for article in squad['data']:
        for para in article['paragraphs']:
            context = para['context']
            unique_pairs.update(
                (sentence, context)
                for sentence in nltk.tokenize.sent_tokenize(context)
            )
    return list(unique_pairs)
def extract_questions_from_squad_json(squad):
    """Collect the unique (question, first answer text) pairs of a SQuAD document.

    Entries whose ``'answers'`` list is empty are skipped; for the rest only
    the text of the first answer is kept.

    Args:
        squad: Parsed SQuAD JSON; read as
            ``squad['data'][i]['paragraphs'][j]['qas'][k]``.

    Returns:
        A list of ``(question, answer_text)`` tuples with duplicates removed.
        The order is unspecified because the pairs pass through a set.
    """
    unique_pairs = {
        (qa['question'], qa['answers'][0]['text'])
        for article in squad['data']
        for para in article['paragraphs']
        for qa in para['qas']
        if qa['answers']
    }
    return list(unique_pairs)
def output_with_highlight(text, highlight):
    """Wrap ``text`` in an HTML ``<li>`` with each ``highlight`` match bolded.

    Matches are found left to right and do not overlap, so highlighting
    ``"aa"`` in ``"aaa"`` bolds only the first two characters.

    Args:
        text: The sentence to render.
        highlight: Substring to wrap in ``<b>...</b>``. When empty, ``text``
            is emitted unchanged.

    Returns:
        The string ``"<li> " + marked-up text + "</li>\\n"``.
    """
    if not highlight:
        # An empty needle matches at every position; the scanning loop this
        # replaces would never advance past index 0.
        return "<li> " + text + "</li>\n"

    # str.split cuts at the same non-overlapping, left-to-right matches that
    # a repeated str.find scan visits, so re-joining the pieces with the
    # bolded needle reproduces the intended markup and always terminates.
    marked_up = ('<b>' + highlight + '</b>').join(text.split(highlight))
    return "<li> " + marked_up + "</li>\n"
def display_nearest_neighbors(query_text, answer_text=None):
    """Show the indexed sentences closest to a question as an HTML list.

    The question is embedded with the ``question_encoder`` signature of the
    module-level ``model``, the module-level ``index`` is asked for
    ``num_results`` neighbours, and the result is rendered through
    ``IPython.display``.

    Args:
        query_text: The question to search for.
        answer_text: Optional expected answer. When truthy it is shown below
            the question and every occurrence of it inside a retrieved
            sentence is bolded via ``output_with_highlight``.

    Returns:
        None. The HTML is displayed as a side effect.
    """
    query_embedding = model.signatures['question_encoder'](
        tf.constant([query_text]))['outputs'][0]
    search_results = index.nearest(query_embedding, n=num_results)

    # The two layouts are mutually exclusive; without the ``else`` the
    # question-only header would overwrite the question-and-answer header
    # and every result would be listed twice.
    if answer_text:
        result_md = (
            "\n"
            "<p>Random Question from SQuAD:</p>\n"
            "<p> <b>%s</b></p>\n"
            "<p> <b>%s</b></p>\n"
        ) % (query_text, answer_text)
        items = [output_with_highlight(s, answer_text) for s in search_results]
    else:
        result_md = (
            "\n"
            "<p> <b>%s</b></p>\n"
        ) % (query_text,)
        items = ['<li>' + s + '</li>\n' for s in search_results]

    # Open the ordered list that the trailing "</ol>" closes.
    result_md += "\n<p>Retrieved sentences :\n<ol>\n"
    result_md += ''.join(items)
    result_md += "</ol>"

    # Without this call the markup is built and then discarded.
    display(HTML(result_md))
10455 sentences, 10552 questions extracted from SQuAD
Example sentence and context:
('Oxygen gas is increasingly obtained by these non-cryogenic technologies (see '
'also the related vacuum swing adsorption).')
('The other major method of producing O\n'
'2 gas involves passing a stream of clean, dry air through one bed of a pair '
'of identical zeolite molecular sieves, which absorbs the nitrogen and '
'delivers a gas stream that is 90% to 93% O\n'
'2. Simultaneously, nitrogen gas is released from the other '
'nitrogen-saturated zeolite bed, by reducing the chamber operating pressure '
'and diverting part of the oxygen gas from the producer bed through it, in '
'the reverse direction of flow. After a set cycle time the operation of the '
'two beds is interchanged, thereby allowing for a continuous supply of '
'gaseous oxygen to be pumped through a pipeline. This is known as pressure '
'swing adsorption. Oxygen gas is increasingly obtained by these non-cryogenic '
'technologies (see also the related vacuum swing adsorption).')
以下程式碼區塊會從 TensorFlow Hub 載入「通用編碼器多語言問答模型」，並使用其 question_encoder 和 response_encoder 簽名；程式碼以 hub.load 直接載入模型，不需另外設定 tensorflow 圖形 g 和工作階段。
從 tensorflow hub 載入模型
# TF Hub handle of the Universal Sentence Encoder Multilingual Q&A model.
# An empty handle cannot be resolved by hub.load.
# NOTE(review): version 3 is the release used by the published tutorial;
# confirm it is the one intended here.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3"
model = hub.load(module_url)
# Number of (sentence, context) pairs sent to the response encoder per call.
batch_size = 100

# Encode a single pair up front only to learn the embedding width, which
# SimpleNeighbors needs at construction time.
# Assumes ``sentences`` is the non-empty list of (sentence, context) tuples
# produced by extract_sentences_from_squad_json; verify against the caller.
encodings = model.signatures['response_encoder'](
    input=tf.constant([sentences[0][0]]),
    context=tf.constant([sentences[0][1]]))
index = simpleneighbors.SimpleNeighbors(
    len(encodings['outputs'][0]), metric='angular')

print('Computing embeddings for %s sentences' % len(sentences))
# Ceiling division plus slicing so the final, shorter batch is embedded too;
# grouping with zip(*(iter(sentences),) * batch_size) silently discards it.
num_batches = (len(sentences) + batch_size - 1) // batch_size
for start in tqdm(range(0, len(sentences), batch_size), total=num_batches):
    batch = sentences[start:start + batch_size]
    response_batch = [r for r, c in batch]
    context_batch = [c for r, c in batch]
    encodings = model.signatures['response_encoder'](
        input=tf.constant(response_batch),
        context=tf.constant(context_batch))
    for batch_index, response in enumerate(response_batch):
        index.add_one(response, encodings['outputs'][batch_index])

# The index must be built before it can answer nearest-neighbour queries.
index.build()
print('simpleneighbors index for %s sentences built.' % len(sentences))
Computing embeddings for 10455 sentences
simpleneighbors index for 10455 sentences built.