Publications
2024
Martínez-Mendoza, Alicia; Jáñez-Martino, Francisco; Carofilis-Vasco, Andrés; Fernández-Robles, Laura; Alegre, Enrique; Fidalgo, Eduardo
Towards Multi-Class Smishing Detection: A Novel Feature Vector Approach and the Smishing-4C Dataset Artículo de revista
En: 2024.
Enlaces | BibTeX | Etiquetas: Multiclass Classification, Smishing Classification, Smishing-4C Dataset, SMS, Text classification
@article{martinez-mendoza_towards_2024,
  author     = {Martínez-Mendoza, Alicia and Jáñez-Martino, Francisco and Carofilis-Vasco, Andrés and Fernández-Robles, Laura and Alegre, Enrique and Fidalgo, Eduardo},
  title      = {Towards Multi-Class Smishing Detection: A Novel Feature Vector Approach and the {Smishing-4C} Dataset},
  year       = {2024},
  date       = {2024-01-01},
  url        = {https://besaya.infor.uva.es/sepln24/paper06.pdf},
  urldate    = {2024-01-01},
  keywords   = {Multiclass Classification, Smishing Classification, Smishing-4C Dataset, SMS, Text classification},
  pubstate   = {published},
  tppubtype  = {article},
  reviewnote = {NOTE(review): the journal field required for an article entry is missing. The URL path hints at a SEPLN 2024 venue, but this is unverified; confirm the venue and add journal, volume and pages.}
}
Al-Nabki, Wesam; Fidalgo, Eduardo; Alegre, Enrique; Delany, Sarah Jane; Jáñez-Martino, Francisco
Classifying the content of online notepad services using active learning Artículo de revista
En: Journal of Intelligent Information Systems, pp. 1–27, 2024, (Publisher: Springer US).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Illegal Activities, machine learning, Pastebin, Text classification
@article{al-nabki_classifying_2024,
  author    = {Al-Nabki, Wesam and Fidalgo, Eduardo and Alegre, Enrique and Delany, Sarah Jane and Jáñez-Martino, Francisco},
  title     = {Classifying the content of online notepad services using active learning},
  journal   = {Journal of Intelligent Information Systems},
  pages     = {1--27},
  year      = {2024},
  date      = {2024-01-01},
  url       = {https://link.springer.com/article/10.1007/s10844-024-00902-8},
  urldate   = {2024-01-01},
  abstract  = {This paper proposes a cascading classification system with Active Learning to identify suspicious activities on Pastebin. The model classifies texts into code snippets, readability, and suspicious or illegal activities. It introduces the INSPECT-3.8M dataset, containing 3.8 million labeled samples. This approach helps law enforcement agencies detect and block illegal content on Pastebin before it spreads.},
  note      = {Publisher: Springer US},
  keywords  = {Cybersecurity, Illegal Activities, machine learning, Pastebin, Text classification},
  pubstate  = {published},
  tppubtype = {article}
}
2023
Jáñez-Martino, Francisco; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Fidalgo, Eduardo; Alegre, Enrique
Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach Artículo de revista
En: Applied Soft Computing, vol. 139, pp. 110226, 2023, (Publisher: Elsevier).
Resumen | Enlaces | BibTeX | Etiquetas: Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding
@article{janez-martino_classifying_2023,
  author    = {Jáñez-Martino, Francisco and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Fidalgo, Eduardo and Alegre, Enrique},
  title     = {Classifying spam emails using agglomerative hierarchical clustering and a topic-based approach},
  journal   = {Applied Soft Computing},
  volume    = {139},
  pages     = {110226},
  year      = {2023},
  date      = {2023-01-01},
  url       = {https://www.sciencedirect.com/science/article/pii/S1568494623002442},
  abstract  = {This paper introduces two novel datasets, SPEMC-15K-E and SPEMC-15K-S, containing 15K spam emails each in English and Spanish. The emails are categorized into 11 classes using hierarchical clustering. Evaluation of 16 classification pipelines reveals that TF-IDF with Logistic Regression achieves the highest performance for the English dataset (F1 score of 0.953, accuracy of 94.6\%), while TF-IDF with Naïve Bayes performs best for Spanish (F1 score of 0.945, accuracy of 98.5\%). TF-IDF with LR is also the fastest for both languages.},
  note      = {Publisher: Elsevier},
  keywords  = {Hidden text, Image-based spam, Multi-classification, Spam detection, Term frequency, Text classification, Word embedding},
  pubstate  = {published},
  tppubtype = {article}
}
Díaz, Daniel; Fernández-Robles, Laura; Al-Nabki, Wesam; Martínez-Mendoza, Alicia; Fidalgo, Eduardo; Alegre, Enrique; Carofilis-Vasco, Andrés
Authorship identification in text documents using BERT and POS features Proceedings Article
En: 5th International Conference on Applications of Intelligent Systems (Las Palmas de Gran Canaria, España), 2023.
Resumen | Enlaces | BibTeX | Etiquetas: authorship identification, BERT, POS features, Text classification
@inproceedings{diaz-ocampo_authorship_2023,
  author    = {Díaz, Daniel and Fernández-Robles, Laura and Al-Nabki, Wesam and Martínez-Mendoza, Alicia and Fidalgo, Eduardo and Alegre, Enrique and Carofilis-Vasco, Andrés},
  title     = {Authorship identification in text documents using {BERT} and {POS} features},
  booktitle = {5th International Conference on Applications of Intelligent Systems (Las Palmas de Gran Canaria, España)},
  year      = {2023},
  date      = {2023-01-01},
  url       = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&sortby=title&citation_for_view=yATJZvcAAAAJ:yB1At4FlUx8C},
  urldate   = {2023-01-01},
  abstract  = {This paper enhances authorship identification by combining BERT embeddings with POS features, improving classification accuracy.},
  keywords  = {authorship identification, BERT, POS features, Text classification},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2022
Redondo-Gutierrez, Luis Ángel; Jáñez-Martino, Francisco; Fidalgo, Eduardo; Alegre, Enrique; González-Castro, Víctor; Alaiz-Rodríguez, Rocío
Detecting malware using text documents extracted from spam email through machine learning Proceedings Article
En: Proceedings of the 22nd ACM Symposium on Document Engineering, pp. 1–4, 2022.
Resumen | Enlaces | BibTeX | Etiquetas: Malware Detection, NLP, Spam Email, Text classification
@inproceedings{redondo-gutierrez_detecting_2022,
  author    = {Redondo-Gutierrez, Luis Ángel and Jáñez-Martino, Francisco and Fidalgo, Eduardo and Alegre, Enrique and González-Castro, Víctor and Alaiz-Rodríguez, Rocío},
  title     = {Detecting malware using text documents extracted from spam email through machine learning},
  booktitle = {Proceedings of the 22nd {ACM} Symposium on Document Engineering},
  pages     = {1--4},
  year      = {2022},
  date      = {2022-01-01},
  url       = {https://dl.acm.org/doi/abs/10.1145/3558100.3563854},
  abstract  = {This work introduces the "Spam Email Malware Detection - 600" (SEMD-600) dataset for detecting malware in spam emails using text analysis. It compares two text representation techniques (Bag of Words and TF-IDF) combined with three classifiers (SVM, Naive Bayes, and Logistic Regression). The combination of TF-IDF and Logistic Regression achieved the best performance, with a macro F1 score of 0.763.},
  keywords  = {Malware Detection, NLP, Spam Email, Text classification},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2020
Molpeceres-Barrientos, Gonzalo; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Parnell, Andrew
Machine learning techniques for the detection of inappropriate erotic content in text Artículo de revista
En: International Journal of Computational Intelligence Systems, vol. 13, no 1, pp. 591–603, 2020, (Publisher: Springer Netherlands Dordrecht).
Resumen | Enlaces | BibTeX | Etiquetas: machine learning, Natural Language Processing, NLP, Text classification
@article{molpeceres-barrientos_machine_2020,
  author    = {Molpeceres-Barrientos, Gonzalo and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Parnell, Andrew},
  title     = {Machine learning techniques for the detection of inappropriate erotic content in text},
  journal   = {International Journal of Computational Intelligence Systems},
  volume    = {13},
  number    = {1},
  pages     = {591--603},
  year      = {2020},
  date      = {2020-01-01},
  url       = {https://link.springer.com/article/10.2991/ijcis.d.200519.003},
  urldate   = {2020-01-01},
  abstract  = {This study addresses the problem of detecting erotic or sexual content in text documents, specifically for protecting children online. Using Natural Language Processing (NLP) techniques, the authors evaluated twelve models combining different text encoders (Bag of Words, TF-IDF, and Word2vec) with various classifiers (SVM, Logistic Regression, k-NN, and Random Forest). The evaluation was conducted on a dataset created from Reddit. The best result was achieved using TF-IDF with an SVM classifier, which achieved an accuracy of 0.97 and an F-score of 0.96 (precision 0.96/recall 0.95). This demonstrates the feasibility of detecting erotic content and creating filters for minors or user preferences.},
  note      = {Publisher: Springer Netherlands Dordrecht},
  keywords  = {machine learning, Natural Language Processing, NLP, Text classification},
  pubstate  = {published},
  tppubtype = {article}
}
2019
Alegre, Enrique
Supervised Machine Learning for Classification, Mining, and Ranking of Illegal Web Contents Tesis doctoral
University of León, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Darknet, Illegal Activities, Pastebin, Text classification, TOR Network
@phdthesis{alegre_supervised_2019,
  author     = {Alegre, Enrique},
  title      = {Supervised Machine Learning for Classification, Mining, and Ranking of Illegal Web Contents},
  school     = {University of León},
  year       = {2019},
  date       = {2019-01-01},
  url        = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&cstart=100&pagesize=100&sortby=title&citation_for_view=yATJZvcAAAAJ:ldfaerwXgEUC},
  abstract   = {This thesis introduces algorithms, methods, and datasets aimed at classifying, mining information, and ranking web domains or similar resources containing text. The focus is on detecting web content that may indicate illegal activities, particularly in the Tor Darknet and Online Notepad Services (ONS), like Pastebin. Motivated by a collaboration with INCIBE, the research addresses the identification of criminal content in these areas, based on the assumption that the Tor network harbors a significant amount of illicit activity.},
  keywords   = {Darknet, Illegal Activities, Pastebin, Text classification, TOR Network},
  pubstate   = {published},
  tppubtype  = {phdthesis},
  reviewnote = {NOTE(review): the sole listed author is also a co-author on publications from 2017 to 2024 in this list. Confirm that the author field names the doctoral candidate and not a supervisor.}
}
Riesco, Adrián; Fidalgo, Eduardo; Al-Nabki, Wesam; Jáñez-Martino, Francisco; Alegre, Enrique
Classifying Pastebin content through the generation of PasteCC labeled dataset Proceedings Article
En: Hybrid Artificial Intelligent Systems: 14th International Conference, HAIS 2019, León, Spain, September 4–6, 2019, Proceedings 14, pp. 456–467, Springer International Publishing, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Cybercrime Detection, Logistic Regression, machine learning, Pastebin, Text classification, TF-IDF
@inproceedings{riesco_classifying_2019,
  author    = {Riesco, Adrián and Fidalgo, Eduardo and Al-Nabki, Wesam and Jáñez-Martino, Francisco and Alegre, Enrique},
  title     = {Classifying {Pastebin} content through the generation of {PasteCC} labeled dataset},
  booktitle = {Hybrid Artificial Intelligent Systems: 14th International Conference, HAIS 2019, León, Spain, September 4–6, 2019, Proceedings 14},
  pages     = {456--467},
  publisher = {Springer International Publishing},
  year      = {2019},
  date      = {2019-01-01},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-29859-3_39},
  abstract  = {This paper presents the PasteCC\_17K dataset, containing 17,640 text samples from Pastebin, classified into 15 categories, including 6 potentially illegal ones. The study evaluates different text representation techniques and classifiers, finding that TF-IDF with Logistic Regression offers the best performance, helping authorities detect suspicious content on Pastebin.},
  keywords  = {Cybercrime Detection, Logistic Regression, machine learning, Pastebin, Text classification, TF-IDF},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2018
Joshi, Akanksha; Fidalgo, Eduardo; Alegre, Enrique; Al-Nabki, Wesam
Extractive Text Summarization in Dark Web: A Preliminary Study Proceedings Article
En: International Conference of Applications of Intelligent Systems, 2018.
Resumen | Enlaces | BibTeX | Etiquetas: Dark Web, Text classification, TOR Network
@inproceedings{joshi_extractive_2018,
  author    = {Joshi, Akanksha and Fidalgo, Eduardo and Alegre, Enrique and Al-Nabki, Wesam},
  title     = {Extractive Text Summarization in {Dark} {Web}: A Preliminary Study},
  booktitle = {International Conference of Applications of Intelligent Systems},
  year      = {2018},
  date      = {2018-01-01},
  url       = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&cstart=20&pagesize=80&sortby=title&citation_for_view=yATJZvcAAAAJ:iH-uZ7U-co4C},
  urldate   = {2018-01-01},
  abstract  = {This paper explores automatic text summarization applied to illegal content extracted from onion websites on the Tor network. The goals include evaluating the feasibility of summarizing such content, comparing summarization methods, and introducing a new dataset called "OWIDSumm," which contains manually curated summaries for 60 documents related to illicit services.},
  keywords  = {Dark Web, Text classification, TOR Network},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2017
Al-Nabki, Wesam; Fidalgo, Eduardo; Alegre, Enrique; De Paz-Centeno, Iván
Classifying illegal activities on Tor network based on web textual contents Proceedings Article
En: Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers, pp. 35–43, 2017.
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Darknet Analysis, Logistic Regression, machine learning, Text classification, TF-IDF
@inproceedings{al_nabki_classifying_2017,
  author    = {Al-Nabki, Wesam and Fidalgo, Eduardo and Alegre, Enrique and De Paz-Centeno, Iván},
  title     = {Classifying illegal activities on {Tor} network based on web textual contents},
  booktitle = {Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers},
  pages     = {35--43},
  year      = {2017},
  date      = {2017-01-01},
  url       = {https://aclanthology.org/E17-1004/},
  urldate   = {2017-01-01},
  abstract  = {This paper introduces DUTA, a publicly available dataset of Darknet domains labeled into 26 classes. Using DUTA, a classification study was conducted with TF-IDF and supervised classifiers. Logistic Regression with TF-IDF achieved 96.6\% accuracy and a 93.7\% F1-score in detecting illegal activities, aiding potential law enforcement tools.},
  keywords  = {Cybersecurity, Darknet Analysis, Logistic Regression, machine learning, Text classification, TF-IDF},
  pubstate  = {published},
  tppubtype = {inproceedings}
}