Publications
2022
Redondo-Gutierrez, Luis Ángel; Jáñez-Martino, Francisco; Fidalgo, Eduardo; Alegre, Enrique; González-Castro, Víctor; Alaiz-Rodríguez, Rocío
Detecting malware using text documents extracted from spam email through machine learning Artículo en actas de congreso
En: Proceedings of the 22nd ACM Symposium on Document Engineering, pp. 1–4, 2022.
Resumen | Enlaces | BibTeX | Etiquetas: Malware Detection, NLP, Spam Email, Text classification
@inproceedings{redondo-gutierrez_detecting_2022,
  title     = {Detecting malware using text documents extracted from spam email through machine learning},
  author    = {Redondo-Gutierrez, Luis Ángel and Jáñez-Martino, Francisco and Fidalgo, Eduardo and Alegre, Enrique and González-Castro, Víctor and Alaiz-Rodríguez, Rocío},
  booktitle = {Proceedings of the 22nd {ACM} Symposium on Document Engineering},
  pages     = {1--4},
  year      = {2022},
  date      = {2022-01-01},
  doi       = {10.1145/3558100.3563854},
  url       = {https://dl.acm.org/doi/abs/10.1145/3558100.3563854},
  abstract  = {This work introduces the "Spam Email Malware Detection - 600" (SEMD-600) dataset for detecting malware in spam emails using text analysis. It compares two text representation techniques (Bag of Words and TF-IDF) combined with three classifiers (SVM, Naive Bayes, and Logistic Regression). The combination of TF-IDF and Logistic Regression achieved the best performance, with a macro F1 score of 0.763.},
  keywords  = {Malware Detection, NLP, Spam Email, Text classification},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2020
Molpeceres-Barrientos, Gonzalo; Alaiz-Rodríguez, Rocío; González-Castro, Víctor; Parnell, Andrew
Machine learning techniques for the detection of inappropriate erotic content in text Artículo de revista
En: International Journal of Computational Intelligence Systems, vol. 13, n.º 1, pp. 591–603, 2020, (Publisher: Springer Netherlands Dordrecht).
Resumen | Enlaces | BibTeX | Etiquetas: machine learning, Natural Language Processing, NLP, Text classification
@article{molpeceres-barrientos_machine_2020,
  title     = {Machine learning techniques for the detection of inappropriate erotic content in text},
  author    = {Molpeceres-Barrientos, Gonzalo and Alaiz-Rodríguez, Rocío and González-Castro, Víctor and Parnell, Andrew},
  journal   = {International Journal of Computational Intelligence Systems},
  volume    = {13},
  number    = {1},
  pages     = {591--603},
  year      = {2020},
  date      = {2020-01-01},
  doi       = {10.2991/ijcis.d.200519.003},
  url       = {https://link.springer.com/article/10.2991/ijcis.d.200519.003},
  urldate   = {2020-01-01},
  abstract  = {This study addresses the problem of detecting erotic or sexual content in text documents, specifically for protecting children online. Using Natural Language Processing (NLP) techniques, the authors evaluated twelve models combining different text encoders (Bag of Words, TF-IDF, and Word2vec) with various classifiers (SVM, Logistic Regression, k-NN, and Random Forest). The evaluation was conducted on a dataset created from Reddit. The best result was achieved using TF-IDF with an SVM classifier, which achieved an accuracy of 0.97 and an F-score of 0.96 (precision 0.96/recall 0.95). This demonstrates the feasibility of detecting erotic content and creating filters for minors or user preferences.},
  note      = {Publisher: Springer Netherlands Dordrecht},
  keywords  = {machine learning, Natural Language Processing, NLP, Text classification},
  pubstate  = {published},
  tppubtype = {article}
}
Sánchez-Paniagua, Manuel; Fidalgo, Eduardo; González-Castro, Víctor; Alegre, Enrique
Impact of current phishing strategies in machine learning models for phishing detection Artículo en actas de congreso
En: 13th International Conference on Computational Intelligence in Security for Information Systems (CISIS), pp. 87–96, 2020.
Resumen | Enlaces | BibTeX | Etiquetas: machine learning, NLP, phishing detection, URL
@inproceedings{sanchez-paniagua_impact_2020,
  title     = {Impact of current phishing strategies in machine learning models for phishing detection},
  author    = {Sánchez-Paniagua, Manuel and Fidalgo, Eduardo and González-Castro, Víctor and Alegre, Enrique},
  booktitle = {13th International Conference on Computational Intelligence in Security for Information Systems ({CISIS})},
  pages     = {87--96},
  year      = {2020},
  date      = {2020-01-01},
  doi       = {10.1007/978-3-030-57805-3_9},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-57805-3_9},
  abstract  = {Phishing is one of the most widespread attacks based on social engineering. The detection of Phishing using Machine Learning approaches is more robust than the blacklist-based ones, which need regular reports and updates. However, the datasets currently used for training the Supervised Learning approaches have some drawbacks. These datasets only have the landing page of legitimate domains and they do not include the login forms from the websites, which is the most common situation in a real case of Phishing. This makes the performance of Machine Learning-based models to drop, especially when they are tested using login pages.},
  keywords  = {machine learning, NLP, phishing detection, URL},
  pubstate  = {published},
  tppubtype = {inproceedings}
}