import bitsandbytes as bnb
import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig
)

import datasets
from datasets import Dataset, load_dataset
import pandas as pd
import os
import numpy as np
import itertools as it
import faiss
import re
import razdel
import more_itertools as mit
import time
from torch.multiprocessing import Pool, cpu_count, set_start_method
from sentence_transformers import SentenceTransformer  # was missing; SentenceTransformer is used below

path_to_data = r'wiki dumps/wikimedia___wikipedia'

# Load the first 10,000 articles of the Russian Wikipedia dump
dataset = load_dataset(path_to_data,
                       data_dir="20231101.ru",
                       split="train[:10000]",
                       )


def clean_text(text):
    text = re.sub(r'\s+', ' ', text)         # collapse extra whitespace and line breaks
    text = re.sub(r'[^\w\s,.!?]', '', text)  # strip special characters
    return text.strip()

def split_into_chunks(text, title):
    # Split the article into sentences, group them five at a time,
    # prepend the title to each group, then pack the groups into
    # batches of at most 200 characters.
    text = clean_text(text)
    sentences = list(razdel.sentenize(text))
    sents_5_idxs = [0] + [sent.stop + 1 for sent in sentences[4::5]] + [None]
    sents_5_slices = [slice(start, stop) for start, stop in it.pairwise(sents_5_idxs)]
    chunks = [title + " " + text[sl] for sl in sents_5_slices]
    chunks = list(mit.constrained_batches(chunks, max_size=200, get_len=len, strict=False))
    return [' '.join(chunk) for chunk in chunks]

def process_entry(entry):
    chunks = split_into_chunks(entry['text'], entry['title'])
    return [(entry['id'], entry['title'], chunk) for chunk in chunks]

start_time = time.time()

# Chunk all articles in parallel, one worker per CPU core
with Pool(cpu_count()) as p:
    results = p.map(process_entry, dataset)

ids, titles, texts = zip(*[item for sublist in results for item in sublist])
new_dataset = Dataset.from_dict({'id': ids, 'title': titles, 'text': texts})

end_time = time.time()
total_time = end_time - start_time

print(f"Dataset preparation took {total_time:.2f} seconds")
new_dataset
# Output:
# Dataset({
#     features: ['id', 'title', 'text'],
#     num_rows: 192814
# })

model = SentenceTransformer('sergeyzh/rubert-tiny-turbo')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def embed(batch):
    embeddings = model.encode(batch['text'], convert_to_tensor=True, device=device)
    batch['embeddings'] = embeddings.cpu().numpy()
    return batch

start_time = time.time()
emb_dataset = new_dataset.map(embed, batched=True, batch_size=16)
end_time = time.time()

total_time = end_time - start_time
print(f"Embedding 10,000 articles with rubert-tiny-turbo took {total_time:.2f} seconds")
print(f"Embedding 10,000 articles with rubert-tiny-turbo took {total_time/60:.2f} minutes")
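faiss is imported above but never used in this snippet, which suggests the embeddings are meant to feed a similarity index next. Below is a minimal sketch of that step, assuming an exact inner-product index over L2-normalized vectors (i.e. cosine similarity); the index type (IndexFlatIP), the k=5 cutoff, and the sample query are illustrative assumptions, not part of the original paste.

import numpy as np
import faiss

# Collect the computed embeddings into a contiguous float32 matrix,
# as required by faiss (assumption: emb_dataset from the snippet above).
embeddings = np.asarray(emb_dataset['embeddings'], dtype='float32')
faiss.normalize_L2(embeddings)  # in-place; IP on unit vectors == cosine

index = faiss.IndexFlatIP(embeddings.shape[1])  # exact inner-product index
index.add(embeddings)

# Embed a sample query with the same model; the query text is a placeholder.
query = model.encode(['Кто написал Войну и мир?'])  # "Who wrote War and Peace?"
query = np.asarray(query, dtype='float32')
faiss.normalize_L2(query)

scores, idxs = index.search(query, 5)  # top-5 most similar chunks
for score, i in zip(scores[0], idxs[0]):
    print(f"{score:.3f}  {emb_dataset[int(i)]['text'][:100]}")

A flat index keeps the example exact and dependency-free; for 192,814 chunks of a rubert-tiny-sized dimension it is still fast, and an approximate index (e.g. IVF or HNSW) would only matter at much larger scale.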