Publications
2024
Al-Nabki, Mhd Wesam; Fidalgo, Eduardo; Alegre, Enrique; Delany, Sarah Jane; Jáñez-Martino, Francisco
Classifying the content of online notepad services using active learning Artículo de revista
En: Journal of Intelligent Information Systems, pp. 1–27, 2024, (Publisher: Springer US).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Illegal Activities, machine learning, Pastebin, Text classification
@comment{Journal article record. The doi value is the identifier embedded in the url field; the publisher was previously stored as free text in the note field.}
@article{al-nabki_classifying_2024,
  author    = {Al-Nabki, Mhd Wesam and Fidalgo, Eduardo and Alegre, Enrique and Delany, Sarah Jane and Jáñez-Martino, Francisco},
  title     = {Classifying the content of online notepad services using active learning},
  journal   = {Journal of Intelligent Information Systems},
  pages     = {1--27},
  year      = {2024},
  date      = {2024-01-01},
  doi       = {10.1007/s10844-024-00902-8},
  url       = {https://link.springer.com/article/10.1007/s10844-024-00902-8},
  publisher = {Springer US},
  abstract  = {This paper proposes a cascading classification system with Active Learning to identify suspicious activities on Pastebin. The model classifies texts into code snippets, readability, and suspicious or illegal activities. It introduces the INSPECT-3.8M dataset, containing 3.8 million labeled samples. This approach helps law enforcement agencies detect and block illegal content on Pastebin before it spreads.},
  keywords  = {Cybersecurity, Illegal Activities, machine learning, Pastebin, Text classification},
  pubstate  = {published},
  tppubtype = {article}
}
Martínez-Mendoza, Alicia; Jáñez-Martino, Francisco; Carofilis, Andrés; Fernández-Robles, Laura; Alegre, Enrique; Fidalgo, Eduardo
Towards Multi-Class Smishing Detection: A Novel Feature Vector Approach and the Smishing-4C Dataset Artículo de revista
En: 2024.
Enlaces | BibTeX | Etiquetas: Multiclass Classification, Smishing Classification, Smishing-4C Dataset, SMS, Text classification
@comment{NOTE(review): this article entry has no journal field, which the article type requires; the venue is not present in this record. The url path suggests a 2024 conference paper - confirm the venue and switch to inproceedings with a booktitle if so.}
@article{martinez-mendoza_towards_2024,
  author    = {Martínez-Mendoza, Alicia and Jáñez-Martino, Francisco and Carofilis, Andrés and Fernández-Robles, Laura and Alegre, Enrique and Fidalgo, Eduardo},
  title     = {Towards Multi-Class Smishing Detection: A Novel Feature Vector Approach and the {Smishing-4C} Dataset},
  year      = {2024},
  date      = {2024-01-01},
  url       = {https://besaya.infor.uva.es/sepln24/paper06.pdf},
  keywords  = {Multiclass Classification, Smishing Classification, Smishing-4C Dataset, SMS, Text classification},
  pubstate  = {published},
  tppubtype = {article}
}
2023
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, ISSN: 1568-4946.
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@comment{Journal article record. Same work as the entry keyed janez-martino_classifying_2023 (identical title, authors, journal, volume and pages); cite only one of the two keys. The doi is stored bare, without the resolver prefix.}
@article{JANEZMARTINO2023110226b,
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  year      = {2023},
  date      = {2023-01-01},
  issn      = {1568-4946},
  doi       = {10.1016/j.asoc.2023.110226},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  urldate   = {2023-01-01},
  abstract  = {Spam emails are unsolicited, annoying and sometimes harmful messages which may contain malware, phishing or hoaxes. Unlike most studies that address the design of efficient anti-spam filters, we approach the spam email problem from a different and novel perspective. Focusing on the needs of cybersecurity units, we follow a topic-based approach for addressing the classification of spam email into multiple categories. We propose SPEMC-15K-E and SPEMC-15K-S, two novel datasets with approximately 15K emails each in English and Spanish, respectively, and we label them using agglomerative hierarchical clustering into 11 classes. We evaluate 16 pipelines, combining four text representation techniques -Term Frequency-Inverse Document Frequency (TF-IDF), Bag of Words, Word2Vec and BERT- and four classifiers: Support Vector Machine, Naïve Bayes, Random Forest and Logistic Regression. Experimental results show that the highest performance is achieved with TF-IDF and LR for the English dataset, with a F1 score of 0.953 and an accuracy of 94.6%, and while for the Spanish dataset, TF-IDF with NB yields a F1 score of 0.945 and 98.5% accuracy. Regarding the processing time, TF-IDF with LR leads to the fastest classification, processing an English and Spanish spam email in 2ms and 2.2ms on average, respectively.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, (Publisher: Elsevier).
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@comment{Journal article record. Same work as the entry keyed JANEZMARTINO2023110226b (identical title, authors, journal, volume and pages); cite only one of the two keys. The doi and issn are the values that duplicate record carries; the publisher was previously stored as free text in the note field.}
@article{janez-martino_classifying_2023,
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  year      = {2023},
  date      = {2023-01-01},
  issn      = {1568-4946},
  doi       = {10.1016/j.asoc.2023.110226},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  publisher = {Elsevier},
  abstract  = {This paper introduces two novel datasets, SPEMC-15K-E and SPEMC-15K-S, containing 15K spam emails each in English and Spanish. The emails are categorized into 11 classes using hierarchical clustering. Evaluation of 16 classification pipelines reveals that TF-IDF with Logistic Regression achieves the highest performance for the English dataset (F1 score of 0.953, accuracy of 94.6%), while TF-IDF with Naïve Bayes performs best for Spanish (F1 score of 0.945, accuracy of 98.5%). TF-IDF with LR is also the fastest for both languages.},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Martinez, Alicia; Al-Nabki, Wesam; Díaz-Ocampo, Daniel; Fernández Robles, Laura; Fidalgo, Eduardo; Alegre, Enrique; Carofilis Vasco, Andres
Authorship identification in text documents using BERT and POS features Artículo de revista
En: 5th International Conference on Applications of Intelligent Systems (Las Palmas de Gran Canaria, España), 2023.
Resumen | Enlaces | BibTeX | Etiquetas: authorship identification, BERT, POS features, Text classification
@comment{Conference paper record. The first author value originally fused three people into one name; they are split here in the order they appeared. NOTE(review): the citation key suggests Daniel Díaz-Ocampo may be the first author - confirm the author order against the paper.}
@inproceedings{daniel_diaz-ocampo_authorship_2023,
  author    = {Martinez, Alicia and Al-Nabki, Wesam and Díaz-Ocampo, Daniel and Fernández Robles, Laura and Fidalgo, Eduardo and Alegre, Enrique and Carofilis Vasco, Andres},
  title     = {Authorship identification in text documents using {BERT} and {POS} features},
  booktitle = {5th International Conference on Applications of Intelligent Systems (Las Palmas de Gran Canaria, España)},
  year      = {2023},
  date      = {2023-01-01},
  url       = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&sortby=title&citation_for_view=yATJZvcAAAAJ:yB1At4FlUx8C},
  abstract  = {This paper enhances authorship identification by combining BERT embeddings with POS features, improving classification accuracy.},
  keywords  = {authorship identification, BERT, POS features, Text classification},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2022
Redondo-Gutierrez, Luis Ángel; Jáñez-Martino, Francisco; Fidalgo, Eduardo; Alegre, Enrique; González-Castro, Víctor; Alaiz-Rodríguez, Rocío
Detecting malware using text documents extracted from spam email through machine learning Artículo de revista
En: Proceedings of the 22nd ACM Symposium on Document Engineering, pp. 1–4, 2022.
Resumen | Enlaces | BibTeX | Etiquetas: Malware Detection, NLP, Spam Email, Text classification
@comment{Conference paper record: the venue is a proceedings volume, so it is stored in booktitle. The doi value is the identifier embedded in the url field.}
@inproceedings{redondo-gutierrez_detecting_2022,
  author    = {Redondo-Gutierrez, Luis Ángel and Jáñez-Martino, Francisco and Fidalgo, Eduardo and Alegre, Enrique and González-Castro, Víctor and Alaiz-Rodríguez, Rocío},
  title     = {Detecting malware using text documents extracted from spam email through machine learning},
  booktitle = {Proceedings of the 22nd {ACM} Symposium on Document Engineering},
  pages     = {1--4},
  year      = {2022},
  date      = {2022-01-01},
  doi       = {10.1145/3558100.3563854},
  url       = {https://dl.acm.org/doi/abs/10.1145/3558100.3563854},
  abstract  = {This work introduces the "Spam Email Malware Detection - 600" (SEMD-600) dataset for detecting malware in spam emails using text analysis. It compares two text representation techniques (Bag of Words and TF-IDF) combined with three classifiers (SVM, Naive Bayes, and Logistic Regression). The combination of TF-IDF and Logistic Regression achieved the best performance, with a macro F1 score of 0.763.},
  keywords  = {Malware Detection, NLP, Spam Email, Text classification},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2019
Riesco, Adrián; Fidalgo, Eduardo; Al-Nabki, Mhd Wesam; Jáñez-Martino, Francisco; Alegre, Enrique
Classifying Pastebin content through the generation of PasteCC labeled dataset Proceedings Article
En: Hybrid Artificial Intelligent Systems: 14th International Conference, HAIS 2019, León, Spain, September 4–6, 2019, Proceedings 14, pp. 456–467, Springer International Publishing, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Cybercrime Detection, Logistic Regression, machine learning, Pastebin, Text classification, TF-IDF
@comment{Conference paper record. The doi value is the identifier embedded in the url field; proper names in the title are brace-protected so sentence-casing styles keep their capitals.}
@inproceedings{riesco_classifying_2019,
  author    = {Riesco, Adrián and Fidalgo, Eduardo and Al-Nabki, Mhd Wesam and Jáñez-Martino, Francisco and Alegre, Enrique},
  title     = {Classifying {Pastebin} content through the generation of {PasteCC} labeled dataset},
  booktitle = {Hybrid Artificial Intelligent Systems: 14th International Conference, HAIS 2019, León, Spain, September 4–6, 2019, Proceedings 14},
  pages     = {456--467},
  publisher = {Springer International Publishing},
  year      = {2019},
  date      = {2019-01-01},
  doi       = {10.1007/978-3-030-29859-3_39},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-29859-3_39},
  abstract  = {This paper presents the PasteCC_17K dataset, containing 17,640 text samples from Pastebin, classified into 15 categories, including 6 potentially illegal ones. The study evaluates different text representation techniques and classifiers, finding that TF-IDF with Logistic Regression offers the best performance, helping authorities detect suspicious content on Pastebin.},
  keywords  = {Cybercrime Detection, Logistic Regression, machine learning, Pastebin, Text classification, TF-IDF},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Alegre Gutiérrez, Enrique
Supervised Machine Learning for Classification, Mining, and Ranking of Illegal Web Contents Tesis doctoral
University of León, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Darknet, Illegal Activities, Pastebin, Text classification, TOR Network
@comment{PhD thesis record. The honorific and all-caps formatting were removed from author, title and school so the bibliography style controls presentation. NOTE(review): the original author value carried a DR prefix, so it may name the thesis supervisor rather than the candidate - confirm against the thesis itself.}
@phdthesis{gutierrez_supervised_2019,
  author    = {Alegre Gutiérrez, Enrique},
  title     = {Supervised Machine Learning for Classification, Mining, and Ranking of Illegal Web Contents},
  school    = {University of León},
  year      = {2019},
  date      = {2019-01-01},
  url       = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&cstart=100&pagesize=100&sortby=title&citation_for_view=yATJZvcAAAAJ:ldfaerwXgEUC},
  abstract  = {This thesis introduces algorithms, methods, and datasets aimed at classifying, mining information, and ranking web domains or similar resources containing text. The focus is on detecting web content that may indicate illegal activities, particularly in the Tor Darknet and Online Notepad Services (ONS), like Pastebin. Motivated by a collaboration with INCIBE, the research addresses the identification of criminal content in these areas, based on the assumption that the Tor network harbors a significant amount of illicit activity.},
  keywords  = {Darknet, Illegal Activities, Pastebin, Text classification, TOR Network},
  pubstate  = {published},
  tppubtype = {phdthesis}
}
2017
Al Nabki, Mhd Wesam; Fidalgo, Eduardo; Alegre, Enrique; De Paz, Ivan
Classifying illegal activities on tor network based on web textual contents Artículo de revista
En: Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers, pp. 35–43, 2017.
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Darknet Analysis, Logistic Regression, machine learning, Text classification, TF-IDF
@comment{Conference paper record: the venue is a proceedings volume, so it is stored in booktitle. Multi-word surnames are given in comma form so that Al Nabki and De Paz are not split into given name plus surname.}
@inproceedings{al_nabki_classifying_2017,
  author    = {Al Nabki, Mhd Wesam and Fidalgo, Eduardo and Alegre, Enrique and De Paz, Ivan},
  title     = {Classifying Illegal Activities on {Tor} Network Based on Web Textual Contents},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  pages     = {35--43},
  year      = {2017},
  date      = {2017-01-01},
  url       = {https://aclanthology.org/E17-1004/},
  abstract  = {This paper introduces DUTA, a publicly available dataset of Darknet domains labeled into 26 classes. Using DUTA, a classification study was conducted with TF-IDF and supervised classifiers. Logistic Regression with TF-IDF achieved 96.6% accuracy and a 93.7% F1-score in detecting illegal activities, aiding potential law enforcement tools.},
  keywords  = {Cybersecurity, Darknet Analysis, Logistic Regression, machine learning, Text classification, TF-IDF},
  pubstate  = {published},
  tppubtype = {inproceedings}
}