Publications
2024
Al-Nabki, Wesam; Fidalgo, Eduardo; Alegre, Enrique; Delany, Sarah Jane; Jáñez-Martino, Francisco
Classifying the content of online notepad services using active learning Artículo de revista
En: Journal of Intelligent Information Systems, pp. 1–27, 2024, (Publisher: Springer US).
Resumen | Enlaces | BibTeX | Etiquetas: Cybersecurity, Illegal Activities, machine learning, Pastebin, Text classification
@article{al-nabki_classifying_2024,
  author    = {Al-Nabki, Wesam and Fidalgo, Eduardo and Alegre, Enrique and Delany, Sarah Jane and Jáñez-Martino, Francisco},
  title     = {Classifying the content of online notepad services using active learning},
  journal   = {Journal of Intelligent Information Systems},
  pages     = {1--27},
  year      = {2024},
  date      = {2024-01-01},
  doi       = {10.1007/s10844-024-00902-8},
  url       = {https://link.springer.com/article/10.1007/s10844-024-00902-8},
  urldate   = {2024-01-01},
  abstract  = {This paper proposes a cascading classification system with Active Learning to identify suspicious activities on Pastebin. The model classifies texts into code snippets, readability, and suspicious or illegal activities. It introduces the INSPECT-3.8M dataset, containing 3.8 million labeled samples. This approach helps law enforcement agencies detect and block illegal content on Pastebin before it spreads.},
  note      = {Publisher: Springer US},
  keywords  = {Cybersecurity, Illegal Activities, machine learning, Pastebin, Text classification},
  pubstate  = {published},
  tppubtype = {article}
}
2019
Alegre, Enrique
Supervised Machine Learning for Classification, Mining, and Ranking of Illegal Web Contents Tesis doctoral
Universidad de León, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Darknet, Illegal Activities, Pastebin, Text classification, TOR Network
@phdthesis{alegre_supervised_2019,
  author        = {Alegre, Enrique},
  title         = {Supervised Machine Learning for Classification, Mining, and Ranking of Illegal Web Contents},
  school        = {Universidad de León},
  year          = {2019},
  date          = {2019-01-01},
  url           = {https://scholar.google.es/citations?view_op=view_citation&hl=es&user=yATJZvcAAAAJ&cstart=100&pagesize=100&sortby=title&citation_for_view=yATJZvcAAAAJ:ldfaerwXgEUC},
  abstract      = {This thesis introduces algorithms, methods, and datasets aimed at classifying, mining information, and ranking web domains or similar resources containing text. The focus is on detecting web content that may indicate illegal activities, particularly in the Tor Darknet and Online Notepad Services (ONS), like Pastebin. Motivated by a collaboration with INCIBE, the research addresses the identification of criminal content in these areas, based on the assumption that the Tor network harbors a significant amount of illicit activity.},
  keywords      = {Darknet, Illegal Activities, Pastebin, Text classification, TOR Network},
  internal-note = {NOTE(review): near-identical title, same school and same year as entry al-nabki_supervised_2019 (author Wesam Al-Nabki). Possibly a duplicate record with a different author attribution; confirm against the thesis record before citing.},
  pubstate      = {published},
  tppubtype     = {phdthesis}
}
Al-Nabki, Wesam
Supervised machine learning for classification mining and ranking of illegal web contents Tesis doctoral
Universidad de León, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Darknet, machine learning, NER, Pastebin, Tor Darknet
@phdthesis{al-nabki_supervised_2019,
  author    = {Wesam Al-Nabki},
  title     = {Supervised machine learning for classification mining and ranking of illegal web contents},
  school    = {Universidad de León},
  year      = {2019},
  date      = {2019-01-01},
  url       = {https://dialnet.unirioja.es/servlet/dctes?codigo=261157},
  keywords  = {Darknet, machine learning, NER, Pastebin, Tor Darknet},
  abstract  = {This thesis develops algorithms and datasets to classify and detect illegal activities in web domains, focusing on the Tor Darknet and services like Pastebin. Using machine learning, datasets like DUTA and DUTA-10K achieve high classification accuracy for Tor domains. Active Learning and Named Entity Recognition (NER) are used for classifying and identifying criminal content, while Graph Theory analyzes emerging products in Tor marketplaces. The thesis introduces ToRank for ranking influential onion domains, outperforming traditional ranking methods. It also compares content-based ranking techniques for detecting drug-related domains.},
  pubstate  = {published},
  tppubtype = {phdthesis}
}
Riesco, Adrián; Fidalgo, Eduardo; Al-Nabki, Wesam; Jáñez-Martino, Francisco; Alegre, Enrique
Classifying Pastebin content through the generation of PasteCC labeled dataset Proceedings Article
En: Hybrid Artificial Intelligent Systems: 14th International Conference, HAIS 2019, León, Spain, September 4–6, 2019, Proceedings 14, pp. 456–467, Springer International Publishing, 2019.
Resumen | Enlaces | BibTeX | Etiquetas: Cybercrime Detection, Logistic Regression, machine learning, Pastebin, Text classification, TF-IDF
@inproceedings{riesco_classifying_2019,
  author    = {Riesco, Adrián and Fidalgo, Eduardo and Al-Nabki, Wesam and Jáñez-Martino, Francisco and Alegre, Enrique},
  title     = {Classifying {Pastebin} content through the generation of {PasteCC} labeled dataset},
  booktitle = {Hybrid Artificial Intelligent Systems: 14th International Conference, HAIS 2019, León, Spain, September 4–6, 2019, Proceedings 14},
  pages     = {456--467},
  publisher = {Springer International Publishing},
  year      = {2019},
  date      = {2019-01-01},
  doi       = {10.1007/978-3-030-29859-3_39},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-29859-3_39},
  abstract  = {This paper presents the PasteCC_17K dataset, containing 17,640 text samples from Pastebin, classified into 15 categories, including 6 potentially illegal ones. The study evaluates different text representation techniques and classifiers, finding that TF-IDF with Logistic Regression offers the best performance, helping authorities detect suspicious content on Pastebin.},
  keywords  = {Cybercrime Detection, Logistic Regression, machine learning, Pastebin, Text classification, TF-IDF},
  pubstate  = {published},
  tppubtype = {inproceedings}
}