Publications
2023
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, ISSN: 1568-4946.
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@comment{Journal article, Applied Soft Computing vol. 139 (2023). NOTE(review): the same work is also listed under keys JANEZMARTINO2023110226 and JANEZMARTINO2023110226c; consider consolidating. The doi field holds the bare DOI (no resolver prefix) and percent signs in the abstract are escaped for LaTeX.}
@article{JANEZMARTINO2023110226b,
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  doi       = {10.1016/j.asoc.2023.110226},
  issn      = {1568-4946},
  year      = {2023},
  date      = {2023-01-01},
  urldate   = {2023-01-01},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  abstract  = {Spam emails are unsolicited, annoying and sometimes harmful messages which may contain malware, phishing or hoaxes. Unlike most studies that address the design of efficient anti-spam filters, we approach the spam email problem from a different and novel perspective. Focusing on the needs of cybersecurity units, we follow a topic-based approach for addressing the classification of spam email into multiple categories. We propose SPEMC-15K-E and SPEMC-15K-S, two novel datasets with approximately 15K emails each in English and Spanish, respectively, and we label them using agglomerative hierarchical clustering into 11 classes. We evaluate 16 pipelines, combining four text representation techniques -Term Frequency-Inverse Document Frequency (TF-IDF), Bag of Words, Word2Vec and BERT- and four classifiers: Support Vector Machine, Näive Bayes, Random Forest and Logistic Regression. Experimental results show that the highest performance is achieved with TF-IDF and LR for the English dataset, with a F1 score of 0.953 and an accuracy of 94.6\%, and while for the Spanish dataset, TF-IDF with NB yields a F1 score of 0.945 and 98.5\% accuracy. Regarding the processing time, TF-IDF with LR leads to the fastest classification, processing an English and Spanish spam email in 2ms and 2.2ms on average, respectively.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
A review of spam email detection: analysis of spammer strategies and the dataset shift problem Artículo de revista
En: Artificial Intelligence Review, vol. 56, no 2, pp. 1145–1173, 2023.
Resumen | Enlaces | BibTeX | Etiquetas:
@comment{Journal article, Artificial Intelligence Review vol. 56 no. 2 (2023). NOTE(review): the same work is also listed under keys Jáñez-Martino2023b and Jáñez-Martino2023c; consider consolidating. The key contains non-ASCII characters, which classic BibTeX toolchains may not accept; it is kept unchanged so existing citations still resolve. The page range uses the BibTeX double hyphen, the empty keywords field is dropped, and the percent sign in the abstract is escaped for LaTeX.}
@article{Jáñez-Martino2023,
  title     = {A review of spam email detection: analysis of spammer strategies and the dataset shift problem},
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  url       = {https://doi.org/10.1007/s10462-022-10195-4},
  doi       = {10.1007/s10462-022-10195-4},
  year      = {2023},
  date      = {2023-01-01},
  journal   = {Artificial Intelligence Review},
  volume    = {56},
  number    = {2},
  pages     = {1145--1173},
  abstract  = {Spam emails have been traditionally seen as just annoying and unsolicited emails containing advertisements, but they increasingly include scams, malware or phishing. In order to ensure the security and integrity for the users, organisations and researchers aim to develop robust filters for spam email detection. Recently, most spam filters based on machine learning algorithms published in academic journals report very high performance, but users are still reporting a rising number of frauds and attacks via spam emails. Two main challenges can be found in this field: (a) it is a very dynamic environment prone to the dataset shift problem and (b) it suffers from the presence of an adversarial figure, i.e. the spammer. Unlike classical spam email reviews, this one is particularly focused on the problems that this constantly changing environment poses. Moreover, we analyse the different spammer strategies used for contaminating the emails, and we review the state-of-the-art techniques to develop filters based on machine learning. Finally, we empirically evaluate and present the consequences of ignoring the matter of dataset shift in this practical field. Experimental results show that this shift may lead to severe degradation in the estimated generalisation performance, with error rates reaching values up to 48.81\%.},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, ISSN: 1568-4946.
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@comment{Journal article, Applied Soft Computing vol. 139 (2023). NOTE(review): the same work is also listed under keys JANEZMARTINO2023110226b and JANEZMARTINO2023110226c; consider consolidating. The doi field holds the bare DOI (no resolver prefix) and percent signs in the abstract are escaped for LaTeX.}
@article{JANEZMARTINO2023110226,
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  doi       = {10.1016/j.asoc.2023.110226},
  issn      = {1568-4946},
  year      = {2023},
  date      = {2023-01-01},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  abstract  = {Spam emails are unsolicited, annoying and sometimes harmful messages which may contain malware, phishing or hoaxes. Unlike most studies that address the design of efficient anti-spam filters, we approach the spam email problem from a different and novel perspective. Focusing on the needs of cybersecurity units, we follow a topic-based approach for addressing the classification of spam email into multiple categories. We propose SPEMC-15K-E and SPEMC-15K-S, two novel datasets with approximately 15K emails each in English and Spanish, respectively, and we label them using agglomerative hierarchical clustering into 11 classes. We evaluate 16 pipelines, combining four text representation techniques -Term Frequency-Inverse Document Frequency (TF-IDF), Bag of Words, Word2Vec and BERT- and four classifiers: Support Vector Machine, Näive Bayes, Random Forest and Logistic Regression. Experimental results show that the highest performance is achieved with TF-IDF and LR for the English dataset, with a F1 score of 0.953 and an accuracy of 94.6\%, and while for the Spanish dataset, TF-IDF with NB yields a F1 score of 0.945 and 98.5\% accuracy. Regarding the processing time, TF-IDF with LR leads to the fastest classification, processing an English and Spanish spam email in 2ms and 2.2ms on average, respectively.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
A review of spam email detection: analysis of spammer strategies and the dataset shift problem Artículo de revista
En: Artificial Intelligence Review, vol. 56, no 2, pp. 1145–1173, 2023.
Resumen | Enlaces | BibTeX | Etiquetas:
@comment{Journal article, Artificial Intelligence Review vol. 56 no. 2 (2023). NOTE(review): the same work is also listed under keys Jáñez-Martino2023 and Jáñez-Martino2023c; consider consolidating. The key contains non-ASCII characters, which classic BibTeX toolchains may not accept; it is kept unchanged so existing citations still resolve. The page range uses the BibTeX double hyphen, the empty keywords field is dropped, and the percent sign in the abstract is escaped for LaTeX.}
@article{Jáñez-Martino2023b,
  title     = {A review of spam email detection: analysis of spammer strategies and the dataset shift problem},
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  url       = {https://doi.org/10.1007/s10462-022-10195-4},
  doi       = {10.1007/s10462-022-10195-4},
  year      = {2023},
  date      = {2023-01-01},
  journal   = {Artificial Intelligence Review},
  volume    = {56},
  number    = {2},
  pages     = {1145--1173},
  abstract  = {Spam emails have been traditionally seen as just annoying and unsolicited emails containing advertisements, but they increasingly include scams, malware or phishing. In order to ensure the security and integrity for the users, organisations and researchers aim to develop robust filters for spam email detection. Recently, most spam filters based on machine learning algorithms published in academic journals report very high performance, but users are still reporting a rising number of frauds and attacks via spam emails. Two main challenges can be found in this field: (a) it is a very dynamic environment prone to the dataset shift problem and (b) it suffers from the presence of an adversarial figure, i.e. the spammer. Unlike classical spam email reviews, this one is particularly focused on the problems that this constantly changing environment poses. Moreover, we analyse the different spammer strategies used for contaminating the emails, and we review the state-of-the-art techniques to develop filters based on machine learning. Finally, we empirically evaluate and present the consequences of ignoring the matter of dataset shift in this practical field. Experimental results show that this shift may lead to severe degradation in the estimated generalisation performance, with error rates reaching values up to 48.81\%.},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, ISSN: 1568-4946.
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@comment{Journal article, Applied Soft Computing vol. 139 (2023). NOTE(review): the same work is also listed under keys JANEZMARTINO2023110226 and JANEZMARTINO2023110226b; consider consolidating. The doi field holds the bare DOI (no resolver prefix) and percent signs in the abstract are escaped for LaTeX.}
@article{JANEZMARTINO2023110226c,
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  doi       = {10.1016/j.asoc.2023.110226},
  issn      = {1568-4946},
  year      = {2023},
  date      = {2023-01-01},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  abstract  = {Spam emails are unsolicited, annoying and sometimes harmful messages which may contain malware, phishing or hoaxes. Unlike most studies that address the design of efficient anti-spam filters, we approach the spam email problem from a different and novel perspective. Focusing on the needs of cybersecurity units, we follow a topic-based approach for addressing the classification of spam email into multiple categories. We propose SPEMC-15K-E and SPEMC-15K-S, two novel datasets with approximately 15K emails each in English and Spanish, respectively, and we label them using agglomerative hierarchical clustering into 11 classes. We evaluate 16 pipelines, combining four text representation techniques -Term Frequency-Inverse Document Frequency (TF-IDF), Bag of Words, Word2Vec and BERT- and four classifiers: Support Vector Machine, Näive Bayes, Random Forest and Logistic Regression. Experimental results show that the highest performance is achieved with TF-IDF and LR for the English dataset, with a F1 score of 0.953 and an accuracy of 94.6\%, and while for the Spanish dataset, TF-IDF with NB yields a F1 score of 0.945 and 98.5\% accuracy. Regarding the processing time, TF-IDF with LR leads to the fastest classification, processing an English and Spanish spam email in 2ms and 2.2ms on average, respectively.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
A review of spam email detection: analysis of spammer strategies and the dataset shift problem Artículo de revista
En: Artificial Intelligence Review, vol. 56, no 2, pp. 1145–1173, 2023.
Resumen | Enlaces | BibTeX | Etiquetas:
@comment{Journal article, Artificial Intelligence Review vol. 56 no. 2 (2023). NOTE(review): the same work is also listed under keys Jáñez-Martino2023 and Jáñez-Martino2023b; consider consolidating. The key contains non-ASCII characters, which classic BibTeX toolchains may not accept; it is kept unchanged so existing citations still resolve. The page range uses the BibTeX double hyphen, the empty keywords field is dropped, and the percent sign in the abstract is escaped for LaTeX.}
@article{Jáñez-Martino2023c,
  title     = {A review of spam email detection: analysis of spammer strategies and the dataset shift problem},
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  url       = {https://doi.org/10.1007/s10462-022-10195-4},
  doi       = {10.1007/s10462-022-10195-4},
  year      = {2023},
  date      = {2023-01-01},
  journal   = {Artificial Intelligence Review},
  volume    = {56},
  number    = {2},
  pages     = {1145--1173},
  abstract  = {Spam emails have been traditionally seen as just annoying and unsolicited emails containing advertisements, but they increasingly include scams, malware or phishing. In order to ensure the security and integrity for the users, organisations and researchers aim to develop robust filters for spam email detection. Recently, most spam filters based on machine learning algorithms published in academic journals report very high performance, but users are still reporting a rising number of frauds and attacks via spam emails. Two main challenges can be found in this field: (a) it is a very dynamic environment prone to the dataset shift problem and (b) it suffers from the presence of an adversarial figure, i.e. the spammer. Unlike classical spam email reviews, this one is particularly focused on the problems that this constantly changing environment poses. Moreover, we analyse the different spammer strategies used for contaminating the emails, and we review the state-of-the-art techniques to develop filters based on machine learning. Finally, we empirically evaluate and present the consequences of ignoring the matter of dataset shift in this practical field. Experimental results show that this shift may lead to severe degradation in the estimated generalisation performance, with error rates reaching values up to 48.81\%.},
  pubstate  = {published},
  tppubtype = {article}
}
2007
Castejón, M.; Alegre, E.; Barreiro, J.; Hernández, L. K.
On-line tool wear monitoring using geometric descriptors from digital images Artículo de revista
En: International Journal of Machine Tools and Manufacture, vol. 47, no 12, pp. 1847-1853, 2007, ISSN: 0890-6955.
Resumen | Enlaces | BibTeX | Etiquetas: Computer vision, Image classification, Monitoring, Tool wear
@comment{Journal article, International Journal of Machine Tools and Manufacture vol. 47 no. 12 (2007). The doi field holds the bare DOI (no resolver prefix), the page range uses the BibTeX double hyphen, and the percent sign in the abstract is escaped for LaTeX.}
@article{CASTEJON20071847,
  title     = {On-line tool wear monitoring using geometric descriptors from digital images},
  author    = {Castejón, M. and Alegre, E. and Barreiro, J. and Hernández, L. K.},
  url       = {https://www.sciencedirect.com/science/article/pii/S0890695507000892},
  doi       = {10.1016/j.ijmachtools.2007.04.001},
  issn      = {0890-6955},
  year      = {2007},
  date      = {2007-01-01},
  journal   = {International Journal of Machine Tools and Manufacture},
  volume    = {47},
  number    = {12},
  pages     = {1847--1853},
  abstract  = {A new method based on a computer vision and statistical learning system is proposed to estimate the wear level in cutting inserts in order to identify the time for its replacement. A CNC parallel lathe and a computer vision system have been used to obtain 1383 flank images. A binary image for each of the former wear flank images have been obtained by applying several pre-processing and segmenting operations. Every wear flank region has been described by means of nine geometrical descriptors. LDA (linear discriminant analysis) shows that three out of the nine descriptors provide the 98.63\% of the necessary information to carry out the classification, which are eccentricity, extent and solidity. The result obtained using a finite mixture model approach shows the presence of three clusters using these descriptors, which correspond with low, medium and high wear level. A monitoring approach is performed using the tool wear evolution for each insert along machining and the discriminant analysis. This evolution represents the probability of belonging to each one of the wear classes (low, medium and high). The estimate of the wear level allows to replace the tool when the wear level is located at the end of the M class (medium), preventing that the tool enters into the H class (high).},
  keywords  = {Computer vision, Image classification, Monitoring, Tool wear},
  pubstate  = {published},
  tppubtype = {article}
}