f1dp

public

thanhtrucwy.21 Jan 25, 2025 Never 47

Python f1_dp 91 lines (80 loc) | 3.56 KB

Raw

def cal_f1score_dp(input_sentence, pred_sentence, gold_sentence):
    """
    Compute precision, recall and F1 for a text-normalization prediction.

    The three sentences are split on whitespace. The predicted and gold
    token lists are then walked together from their last token towards
    their first, so sentences of different lengths can still be compared.
    Membership tests ("is this token in the input / prediction / gold?")
    look at the whole sentence, not at a particular position.

    Counting rules:
    - TP: words that need normalization and the model normalized correctly
      (prediction equals gold and the token is absent from the input).
    - TN: words that do not need normalization and the model left unchanged
      (prediction equals gold and the token is present in the input).
    - FP: words that did not need normalization but the model normalized
      anyway (predicted token absent from both the gold and the input).
    - FN: words that need normalization but the model got wrong, either by
      leaving them unchanged or by normalizing them incorrectly (gold token
      absent from both the prediction and the input).

    Tokens that cannot be classified by the rules above (for example a
    token that exists in the other sentence but at a different position)
    are skipped without being counted.

    Parameters:
    - input_sentence: Original sentence (string).
    - pred_sentence: Predicted normalized sentence (string).
    - gold_sentence: Ground truth normalized sentence (string).

    Returns:
    - A dictionary with the keys "precision", "recall" and "f1 score"
      (each rounded to 2 decimals; 0 when its denominator is 0) and the raw
      counts under "TP", "TN", "FP" and "FN".
    """
    pred_tokens = pred_sentence.split()
    gold_tokens = gold_sentence.split()

    # Sets give O(1) membership tests inside the scan below; the answers are
    # the same as testing against the token lists.
    input_vocab = set(input_sentence.split())
    pred_vocab = set(pred_tokens)
    gold_vocab = set(gold_tokens)

    tp = tn = fp = fn = 0

    # The scan only needs the two cursors below, so no alignment table is
    # allocated (a table over all three sentences would cost O(n * m * p)
    # time and memory without influencing the counts).
    j, k = len(pred_tokens), len(gold_tokens)
    while j > 0 or k > 0:
        pred_tok = pred_tokens[j - 1] if j > 0 else None
        gold_tok = gold_tokens[k - 1] if k > 0 else None

        if j > 0 and k > 0 and pred_tok == gold_tok:
            if pred_tok in input_vocab:
                tn += 1  # Correct non-normalization
            else:
                tp += 1  # Correct normalization
            j -= 1
            k -= 1
        elif j > 0 and pred_tok not in gold_vocab and pred_tok not in input_vocab:
            fp += 1  # Unnecessary normalization
            j -= 1
        elif k > 0 and gold_tok not in pred_vocab:
            if gold_tok not in input_vocab:
                fn += 1  # The model failed to produce a required word
            k -= 1
        elif j > 0:
            # Unclassifiable predicted token: skip it without counting.
            j -= 1
        else:
            # Prediction is exhausted but the remaining gold token also
            # occurs somewhere in the prediction, so none of the branches
            # above applies. Skip it (mirroring the predicted-token
            # fallback) so the loop always makes progress and terminates.
            k -= 1

    # Precision and Recall
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return {
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1 score": round(f1, 2),
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn,
    }

def cal_f1score_dp(input_sentence, pred_sentence, gold_sentence):
    """
    Compute precision, recall and F1 for a text-normalization prediction.

    The three sentences are split on whitespace. The predicted and gold
    token lists are then walked together from their last token towards
    their first, so sentences of different lengths can still be compared.
    Membership tests ("is this token in the input / prediction / gold?")
    look at the whole sentence, not at a particular position.

    Counting rules:
    - TP: words that need normalization and the model normalized correctly
      (prediction equals gold and the token is absent from the input).
    - TN: words that do not need normalization and the model left unchanged
      (prediction equals gold and the token is present in the input).
    - FP: words that did not need normalization but the model normalized
      anyway (predicted token absent from both the gold and the input).
    - FN: words that need normalization but the model got wrong, either by
      leaving them unchanged or by normalizing them incorrectly (gold token
      absent from both the prediction and the input).

    Tokens that cannot be classified by the rules above (for example a
    token that exists in the other sentence but at a different position)
    are skipped without being counted.

    Parameters:
    - input_sentence: Original sentence (string).
    - pred_sentence: Predicted normalized sentence (string).
    - gold_sentence: Ground truth normalized sentence (string).

    Returns:
    - A dictionary with the keys "precision", "recall" and "f1 score"
      (each rounded to 2 decimals; 0 when its denominator is 0) and the raw
      counts under "TP", "TN", "FP" and "FN".
    """
    pred_tokens = pred_sentence.split()
    gold_tokens = gold_sentence.split()

    # Sets give O(1) membership tests inside the scan below; the answers are
    # the same as testing against the token lists.
    input_vocab = set(input_sentence.split())
    pred_vocab = set(pred_tokens)
    gold_vocab = set(gold_tokens)

    tp = tn = fp = fn = 0

    # The scan only needs the two cursors below, so no alignment table is
    # allocated (a table over all three sentences would cost O(n * m * p)
    # time and memory without influencing the counts).
    j, k = len(pred_tokens), len(gold_tokens)
    while j > 0 or k > 0:
        pred_tok = pred_tokens[j - 1] if j > 0 else None
        gold_tok = gold_tokens[k - 1] if k > 0 else None

        if j > 0 and k > 0 and pred_tok == gold_tok:
            if pred_tok in input_vocab:
                tn += 1  # Correct non-normalization
            else:
                tp += 1  # Correct normalization
            j -= 1
            k -= 1
        elif j > 0 and pred_tok not in gold_vocab and pred_tok not in input_vocab:
            fp += 1  # Unnecessary normalization
            j -= 1
        elif k > 0 and gold_tok not in pred_vocab:
            if gold_tok not in input_vocab:
                fn += 1  # The model failed to produce a required word
            k -= 1
        elif j > 0:
            # Unclassifiable predicted token: skip it without counting.
            j -= 1
        else:
            # Prediction is exhausted but the remaining gold token also
            # occurs somewhere in the prediction, so none of the branches
            # above applies. Skip it (mirroring the predicted-token
            # fallback) so the loop always makes progress and terminates.
            k -= 1

    # Precision and Recall
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return {
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1 score": round(f1, 2),
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn,
    }