如何在 R 中截断一个字符串向量?
数据分析中最困难的工作之一是清洗脏数据。大多数情况下,拿到的数据都不够整洁,其中一种常见情形是:字符串向量的每个元素在某个字符之后还带有不需要的内容(例如多余的数值)。因此,要在某个字符处截断字符串向量(只保留该字符之前的部分),可以使用 stringr 包中的 str_split 函数配合 sapply 函数,如下例所示。
library(stringr)
示例
# Build a character vector of 100 strings of the form "<country>#<number>",
# drawn with replacement from seven templates. No seed is set, so the draw
# (and therefore the printed result) differs from run to run.
# The spelling "Egpyt" is kept as-is so it matches the output shown below.
x1 <- sample(
  c(
    "India#21", "China#42", "UK#14", "Japan#22",
    "United States#25", "Egpyt#27", "Sudan#36"
  ),
  100,
  replace = TRUE
)
# Print the sampled vector.
x1
输出
[1] "China#42" "Egpyt#27" "India#21" [4] "China#42" "United States#25" "Japan#22" [7] "Egpyt#27" "Japan#22" "Egpyt#27" [10] "India#21" "India#21" "India#21" [13] "China#42" "Japan#22" "UK#14" [16] "India#21" "India#21" "China#42" [19] "United States#25" "Japan#22" "Sudan#36" [22] "China#42" "United States#25" "United States#25" [25] "Sudan#36" "India#21" "India#21" [28] "Sudan#36" "Egpyt#27" "Japan#22" [31] "UK#14" "UK#14" "UK#14" [34] "United States#25" "United States#25" "UK#14" [37] "Egpyt#27" "Egpyt#27" "India#21" [40] "India#21" "UK#14" "China#42" [43] "UK#14" "United States#25" "India#21" [46] "Egpyt#27" "Japan#22" "India#21" [49] "Japan#22" "UK#14" "Egpyt#27" [52] "Sudan#36" "Japan#22" "United States#25" [55] "Japan#22" "UK#14" "Sudan#36" [58] "Sudan#36" "Egpyt#27" "Sudan#36" [61] "India#21" "China#42" "Egpyt#27" [64] "Sudan#36" "Sudan#36" "Egpyt#27" [67] "China#42" "Japan#22" "Egpyt#27" [70] "China#42" "India#21" "United States#25" [73] "Egpyt#27" "United States#25" "India#21" [76] "Sudan#36" "Sudan#36" "India#21" [79] "Japan#22" "India#21" "Sudan#36" [82] "United States#25" "China#42" "China#42" [85] "Japan#22" "Egpyt#27" "China#42" [88] "Sudan#36" "United States#25" "United States#25" [91] "India#21" "Japan#22" "United States#25" [94] "China#42" "Japan#22" "Japan#22" [97] "Japan#22" "UK#14" "China#42" [100] "China#42"
示例
# Truncate every element of x1 at "#": str_split() returns a list with one
# character vector of pieces per element, and sapply(..., "[", 1) picks the
# first piece of each, i.e. the text before the "#".
sapply(str_split(x1, "#"), "[", 1)
输出
[1] "China" "Egpyt" "India" "China" [5] "United States" "Japan" "Egpyt" "Japan" [9] "Egpyt" "India" "India" "India" [13] "China" "Japan" "UK" "India" [17] "India" "China" "United States" "Japan" [21] "Sudan" "China" "United States" "United States" [25] "Sudan" "India" "India" "Sudan" [29] "Egpyt" "Japan" "UK" "UK" [33] "UK" "United States" "United States" "UK" [37] "Egpyt" "Egpyt" "India" "India" [41] "UK" "China" "UK" "United States" [45] "India" "Egpyt" "Japan" "India" [49] "Japan" "UK" "Egpyt" "Sudan" [53] "Japan" "United States" "Japan" "UK" [57] "Sudan" "Sudan" "Egpyt" "Sudan" [61] "India" "China" "Egpyt" "Sudan" [65] "Sudan" "Egpyt" "China" "Japan" [69] "Egpyt" "China" "India" "United States" [73] "Egpyt" "United States" "India" "Sudan" [77] "Sudan" "India" "Japan" "India" [81] "Sudan" "United States" "China" "China" [85] "Japan" "Egpyt" "China" "Sudan" [89] "United States" "United States" "India" "Japan" [93] "United States" "China" "Japan" "Japan" [97] "Japan" "UK" "China" "China"
示例
# Build a character vector of 100 e-mail-style strings, drawn with replacement
# from eight templates. No seed is set, so the draw differs from run to run.
# NOTE(review): the original addresses were replaced by an e-mail-obfuscation
# placeholder when this page was captured. The local parts below are taken
# from the truncated result printed further down; "example.com" is a stand-in
# domain and the original order of the templates is unknown -- confirm against
# the original article if the exact values matter.
x2 <- sample(
  c(
    "supriya@example.com", "rahul@example.com", "krishna@example.com",
    "shobhit@example.com", "nizam@example.com", "surbhi@example.com",
    "rushi@example.com", "ujjal@example.com"
  ),
  100,
  replace = TRUE
)
# Print the sampled vector.
x2
输出
[1] "[email protected]" "[email protected]" "[email protected]" [4] "[email protected]" "[email protected]" "[email protected]" [7] "[email protected]" "[email protected]" "[email protected]" [10] "[email protected]" "[email protected]" "[email protected]" [13] "[email protected]" "[email protected]" "[email protected]" [16] "[email protected]" "[email protected]" "[email protected]" [19] "[email protected]" "[email protected]" "[email protected]" [22] "[email protected]" "[email protected]" "[email protected]" [25] "[email protected]" "[email protected]" "[email protected]" [28] "[email protected]" "[email protected]" "[email protected]" [31] "[email protected]" "[email protected]" "[email protected]" [34] "[email protected]" "[email protected]" "[email protected]" [37] "[email protected]" "[email protected]" "[email protected]" [40] "[email protected]" "[email protected]" "[email protected]" [43] "[email protected]" "[email protected]" "[email protected]" [46] "[email protected]" "[email protected]" "[email protected]" [49] "[email protected]" "[email protected]" "[email protected]" [52] "[email protected]" "[email protected]" "[email protected]" [55] "[email protected]" "[email protected]" "[email protected]" [58] "[email protected]" "[email protected]" "[email protected]" [61] "[email protected]" "[email protected]" "[email protected]" [64] "[email protected]" "[email protected]" "[email protected]" [67] "[email protected]" "[email protected]" "[email protected]" [70] "[email protected]" "[email protected]" "[email protected]" [73] "[email protected]" "[email protected]" "[email protected]" [76] "[email protected]" "[email protected]" "[email protected]" [79] "[email protected]" "[email protected]" "[email protected]" [82] "[email protected]" "[email protected]" "[email protected]" [85] "[email protected]" "[email protected]" "[email protected]" [88] "[email protected]" "[email protected]" "[email protected]" [91] "[email protected]" "[email protected]" "[email 
protected]" [94] "[email protected]" "[email protected]" "[email protected]" [97] "[email protected]" "[email protected]" "[email protected]" [100] "[email protected]"
示例
# Truncate every element of x2 at "@": str_split() returns a list with one
# character vector of pieces per element, and sapply(..., "[", 1) picks the
# first piece of each, i.e. the text before the "@".
sapply(str_split(x2, "@"), "[", 1)
输出
[1] "supriya" "rahul" "krishna" "shobhit" "nizam" "shobhit" "nizam" [8] "surbhi" "rushi" "rushi" "rushi" "rushi" "krishna" "shobhit" [15] "ujjal" "nizam" "supriya" "ujjal" "ujjal" "supriya" "rahul" [22] "shobhit" "krishna" "nizam" "shobhit" "rushi" "rushi" "ujjal" [29] "ujjal" "ujjal" "supriya" "rahul" "ujjal" "shobhit" "krishna" [36] "krishna" "shobhit" "surbhi" "nizam" "surbhi" "ujjal" "shobhit" [43] "ujjal" "krishna" "supriya" "ujjal" "supriya" "ujjal" "ujjal" [50] "rushi" "krishna" "rahul" "nizam" "rushi" "nizam" "surbhi" [57] "rahul" "supriya" "nizam" "shobhit" "rahul" "shobhit" "supriya" [64] "shobhit" "rahul" "shobhit" "ujjal" "supriya" "nizam" "surbhi" [71] "rushi" "rushi" "rushi" "supriya" "surbhi" "nizam" "rushi" [78] "supriya" "nizam" "rahul" "rahul" "surbhi" "rushi" "ujjal" [85] "rahul" "rushi" "rushi" "ujjal" "ujjal" "nizam" "supriya" [92] "surbhi" "nizam" "surbhi" "supriya" "shobhit" "supriya" "rahul" [99] "nizam" "rushi"
广告