import bitsandbytes as bnb
import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig
)

import datasets
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer  # needed for the embedding model below
import pandas as pd
import os
import numpy as np
import itertools as it
import faiss
import re
import razdel
import more_itertools as mit
import time
from torch.multiprocessing import Pool, cpu_count, set_start_method

path_to_data = r'wiki dumps/wikimedia___wikipedia'

dataset = load_dataset(path_to_data,
                       data_dir="20231101.ru",
                       split="train[:10000]",
                       )


def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # collapse extra whitespace and line breaks
    text = re.sub(r'[^\w\s,.!?]', '', text)  # drop special characters
    return text.strip()

def split_into_chunks(text, title):
    text = clean_text(text)
    sentences = list(razdel.sentenize(text))
    # Cut the text after every 5th sentence and slice it into the resulting spans
    sents_5_idxs = [0] + [sent.stop + 1 for sent in sentences[4::5]] + [None]
    sents_5_slices = [slice(start, stop) for start, stop in it.pairwise(sents_5_idxs)]
    chunks = [title + " " + text[sl] for sl in sents_5_slices]
    # Group adjacent chunks so each batch stays under 200 characters
    # (strict=False keeps oversized single chunks instead of raising)
    chunks = list(mit.constrained_batches(chunks, max_size=200, get_len=len, strict=False))
    return [' '.join(chunk) for chunk in chunks]
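
# Optional sanity check of the chunking on a single article: a minimal sketch,
# assuming the standard wikimedia/wikipedia columns 'title' and 'text'.
sample = dataset[0]
print(split_into_chunks(sample['text'], sample['title'])[:3])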

def process_entry(entry):
    chunks = split_into_chunks(entry['text'], entry['title'])
    return [(entry['id'], entry['title'], chunk) for chunk in chunks]

start_time = time.time()

with Pool(cpu_count()) as p:
    results = p.map(process_entry, dataset)

ids, titles, texts = zip(*[item for sublist in results for item in sublist])
new_dataset = Dataset.from_dict({'id': ids, 'title': titles, 'text': texts})

end_time = time.time()
total_time = end_time - start_time

print(f"Dataset preparation took {total_time:.2f} seconds")

new_dataset
# Dataset({
#     features: ['id', 'title', 'text'],
#     num_rows: 192814
# })

model = SentenceTransformer('sergeyzh/rubert-tiny-turbo')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def embed(batch):
    embeddings = model.encode(batch['text'], convert_to_tensor=True, device=device)
    batch['embeddings'] = embeddings.cpu().numpy()
    return batch

start_time = time.time()
emb_dataset = new_dataset.map(embed, batched=True, batch_size=16)
end_time = time.time()

total_time = end_time - start_time
print(f"Embedding the chunked dataset with rubert-tiny-turbo took {total_time:.2f} seconds")
print(f"Embedding the chunked dataset with rubert-tiny-turbo took {total_time/60:.2f} minutes")