Publications
2025
Jáñez-Martino, Francisco; Barrón-Cedeño, Alberto; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Muti, Arianna
On persuasion in spam email: A multi-granularity text analysis Artículo de revista
En: Expert Systems with Applications, vol. 265, pp. 125767, 2025, (Publisher: Pergamon).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, machine learning, Natural Language Processing, Spam detection
@article{janez-martino_persuasion_2025,
  author    = {Jáñez-Martino, Francisco and Barrón-Cedeño, Alberto and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Muti, Arianna},
  title     = {On persuasion in spam email: A multi-granularity text analysis},
  journal   = {Expert Systems with Applications},
  volume    = {265},
  pages     = {125767},
  year      = {2025},
  date      = {2025-01-01},
  publisher = {Pergamon},
  url       = {https://www.sciencedirect.com/science/article/pii/S0957417424026344},
  abstract  = {This paper explores the use of supervised machine learning models to detect persuasion techniques in spam emails, addressing both binary classification (presence/absence of persuasion) and multilabel classification (identifying specific persuasion techniques). The research utilizes natural language processing and adapts propaganda detection methods from news articles, analyzing emails at full-text, sentence, and snippet levels. The study includes the development of a custom spam dataset and fine-tuning of RoBERTa-based models, ultimately aiming to enhance cybersecurity through better understanding of persuasion tactics in malicious emails.},
  keywords  = {Cybersecurity, machine learning, Natural Language Processing, Spam detection},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Spam email classification based on cybersecurity potential risk using natural language processing Artículo de revista
En: Knowledge-Based Systems, vol. 310, pp. 112939, 2025, (Publisher: Elsevier).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Email Classification, machine learning, Natural Language Processing, Spam detection
@article{janez-martino_spam_2025,
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {Spam email classification based on cybersecurity potential risk using natural language processing},
  journal   = {Knowledge-Based Systems},
  volume    = {310},
  pages     = {112939},
  year      = {2025},
  date      = {2025-01-01},
  publisher = {Elsevier},
  url       = {https://www.sciencedirect.com/science/article/pii/S0950705124015739},
  abstract  = {This study focuses on detecting spam emails, a key vector for cyberattacks. It introduces 56 features based on NLP techniques, grouped into five categories: Headers, Text, Attachments, URLs, and Protocols. A new dataset, SERC, was created for spam risk classification. Using binary classification and regression, the Random Forest classifier achieved the best performance (F1-Score of 0.914), and Random Forest Regressor had the lowest Mean Square Error (0.781). Features from the Headers and Text groups were found to be the most important.},
  keywords  = {Cybersecurity, Email Classification, machine learning, Natural Language Processing, Spam detection},
  pubstate  = {published},
  tppubtype = {article}
}
2024
Jáñez Martino, Francisco; Carofilis, Andrés; Alaiz Rodríguez, Rocío; González Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Spam hierarchical clustering for campaigns spotting and topic-based classification [Póster] Artículo de revista
En: 2024, (Publisher: Universidad de Sevilla. Escuela Técnica Superior de Ingeniería Informática).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Logistic Regression, Multi-classification, Spam detection
@misc{janez_martino_spam_2024,
  author       = {Jáñez Martino, Francisco and Carofilis, Andrés and Alaiz Rodríguez, Rocío and González Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title        = {Spam hierarchical clustering for campaigns spotting and topic-based classification},
  howpublished = {Póster},
  year         = {2024},
  date         = {2024-01-01},
  url          = {https://idus.us.es/items/9828eae7-9cec-4574-8863-99e9020e1770},
  abstract     = {This article develops spam email multiclassification systems for cybersecurity, using two datasets: SPEMC-15K-E (English) and SPEMC-15K-S (Spanish). The datasets are classified into eleven categories. The best results for English (F1-score: 0.953, 94.6% accuracy) were achieved with TF-IDF and Logistic Regression, while for Spanish, TF-IDF and Naïve Bayes achieved an F1-score of 0.945 and 98.5% accuracy. TF-IDF with Logistic Regression also had the fastest processing time (2ms per email for English and 2.2ms for Spanish).},
  note         = {Publisher: Universidad de Sevilla. Escuela Técnica Superior de Ingeniería Informática},
  keywords     = {Cybersecurity, Logistic Regression, Multi-classification, Spam detection},
  pubstate     = {published},
  tppubtype    = {misc}
}
Díaz, Daniel; Al-Nabki, Wesam; Fernández-Robles, Laura; Alegre, Enrique; Fidalgo, Eduardo; Martínez-Mendoza, Alicia
SpamClus: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection Artículo de revista
En: International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security (NLPAICS 2024), 2024.
Resumen | Enlaces | BibTeX | Etiquetas: Agglomerative Clustering, Cybersecurity, Email Classification, machine learning, Spam detection, SpamClus
@inproceedings{diaz_spamclus_2024,
  author    = {Díaz, Daniel and Al-Nabki, Wesam and Fernández-Robles, Laura and Alegre, Enrique and Fidalgo, Eduardo and Martínez-Mendoza, Alicia},
  title     = {{SpamClus}: An Agglomerative Clustering Algorithm for Spam Email Campaigns Detection},
  booktitle = {International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security ({NLPAICS} 2024)},
  year      = {2024},
  date      = {2024-01-01},
  url       = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&cstart=100&pagesize=100&sortby=title&citation_for_view=yATJZvcAAAAJ:t7zJ5fGR-2UC},
  keywords  = {Agglomerative Clustering, Cybersecurity, Email Classification, machine learning, Spam detection, SpamClus},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2023
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, ISSN: 1568-4946.
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@article{JANEZMARTINO2023110226b,
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  year      = {2023},
  date      = {2023-01-01},
  issn      = {1568-4946},
  doi       = {10.1016/j.asoc.2023.110226},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  urldate   = {2023-01-01},
  abstract  = {Spam emails are unsolicited, annoying and sometimes harmful messages which may contain malware, phishing or hoaxes. Unlike most studies that address the design of efficient anti-spam filters, we approach the spam email problem from a different and novel perspective. Focusing on the needs of cybersecurity units, we follow a topic-based approach for addressing the classification of spam email into multiple categories. We propose SPEMC-15K-E and SPEMC-15K-S, two novel datasets with approximately 15K emails each in English and Spanish, respectively, and we label them using agglomerative hierarchical clustering into 11 classes. We evaluate 16 pipelines, combining four text representation techniques -Term Frequency-Inverse Document Frequency (TF-IDF), Bag of Words, Word2Vec and BERT- and four classifiers: Support Vector Machine, Naïve Bayes, Random Forest and Logistic Regression. Experimental results show that the highest performance is achieved with TF-IDF and LR for the English dataset, with a F1 score of 0.953 and an accuracy of 94.6%, and while for the Spanish dataset, TF-IDF with NB yields a F1 score of 0.945 and 98.5% accuracy. Regarding the processing time, TF-IDF with LR leads to the fastest classification, processing an English and Spanish spam email in 2ms and 2.2ms on average, respectively.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, (Publisher: Elsevier).
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@article{janez-martino_classifying_2023,
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  year      = {2023},
  date      = {2023-01-01},
  publisher = {Elsevier},
  issn      = {1568-4946},
  doi       = {10.1016/j.asoc.2023.110226},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  abstract  = {This paper introduces two novel datasets, SPEMC-15K-E and SPEMC-15K-S, containing 15K spam emails each in English and Spanish. The emails are categorized into 11 classes using hierarchical clustering. Evaluation of 16 classification pipelines reveals that TF-IDF with Logistic Regression achieves the highest performance for the English dataset (F1 score of 0.953, accuracy of 94.6%), while TF-IDF with Naïve Bayes performs best for Spanish (F1 score of 0.945, accuracy of 98.5%). TF-IDF with LR is also the fastest for both languages.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Martínez Mendoza, Alicia; Jáñez Martino, Francisco; Alaiz Rodríguez, Rocío; González Castro, Víctor; Fidalgo Fernández, Eduardo; Alegre, Enrique
A survey on spam detection, spammer strategies and the dataset shift problem Artículo de revista
En: Actas de las VIII Jornadas Nacionales de Investigación en Ciberseguridad: Vigo, 21 a 23 de junio de 2023, pp. 485–486, 2023, (Publisher: Universidade de Vigo).
Resumen | Enlaces | BibTeX | Etiquetas: dataset shift, Spam detection, spammer strategies
@inproceedings{martinez_mendoza_survey_2023,
  author    = {Martínez Mendoza, Alicia and Jáñez Martino, Francisco and Alaiz Rodríguez, Rocío and González Castro, Víctor and Fidalgo Fernández, Eduardo and Alegre, Enrique},
  title     = {A survey on spam detection, spammer strategies and the dataset shift problem},
  booktitle = {Actas de las VIII Jornadas Nacionales de Investigación en Ciberseguridad: Vigo, 21 a 23 de junio de 2023},
  pages     = {485--486},
  year      = {2023},
  date      = {2023-01-01},
  publisher = {Universidade de Vigo},
  url       = {https://dialnet.unirioja.es/servlet/articulo?codigo=9044942},
  keywords  = {dataset shift, Spam detection, spammer strategies},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Jáñez Martino, Francisco; Alaiz Rodríguez, Rocío; González Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
A review of spam email detection: analysis of spammer strategies and the dataset shift problem Artículo de revista
En: Artificial Intelligence Review, vol. 56, no 2, pp. 1145–1173, 2023, (Publisher: Springer Netherlands Dordrecht).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, dataset shift, machine learning, Spam detection
@article{janez_martino_review_2023,
  author    = {Jáñez Martino, Francisco and Alaiz Rodríguez, Rocío and González Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {A review of spam email detection: analysis of spammer strategies and the dataset shift problem},
  journal   = {Artificial Intelligence Review},
  volume    = {56},
  number    = {2},
  pages     = {1145--1173},
  year      = {2023},
  date      = {2023-01-01},
  publisher = {Springer Netherlands},
  address   = {Dordrecht},
  doi       = {10.1007/s10462-022-10195-4},
  url       = {https://link.springer.com/article/10.1007/s10462-022-10195-4},
  abstract  = {Spam emails, which once were mainly an annoyance, now increasingly contain scams, malware, and phishing attempts. Despite high-performing spam filters based on machine learning, users continue to report rising incidents of fraud and attacks via spam. This paper highlights two key challenges in spam email detection: the dynamic nature of the environment, leading to dataset shift, and the presence of adversarial actors (spammers). The review focuses on the impact of these challenges and examines various spammer strategies and state-of-the-art techniques for developing robust filters. Experimental results show that ignoring dataset shift can severely degrade the performance of spam filters, leading to high error rates.},
  keywords  = {Cybersecurity, dataset shift, machine learning, Spam detection},
  pubstate  = {published},
  tppubtype = {article}
}