209 lines
7.9 KiB
PHP
Executable File
209 lines
7.9 KiB
PHP
Executable File
<?php

// Load the project configuration. The converter class below reads the
// BOT_BASE_URL constant when resolving relative image URLs; it is presumably
// defined by this config file (not visible from here — verify).
require_once __DIR__ . '/../../config/config.php';

/**
 * Converts HTML fragments into Discord-flavoured Markdown.
 *
 * Two output modes are offered:
 *  - convert():        one Markdown string (bold/italic/underline, links,
 *                      lists, paragraphs, bare image URLs).
 *  - convertToArray(): an ordered list of "text" and "image" parts so the
 *                      caller can handle images separately from text. Inline
 *                      formatting tags are NOT turned into Markdown in this
 *                      mode; only their text content is kept.
 */
class HtmlToDiscordMarkdownConverter
{
    /** Discord message length limit. Declared but not referenced inside this class. */
    private const DISCORD_MESSAGE_LIMIT = 2000;

    /**
     * Converts an HTML fragment into a single Markdown string.
     *
     * Runs of three or more newlines are collapsed to a blank line and the
     * result is trimmed.
     *
     * @param string $html HTML fragment (may be malformed or empty).
     *
     * @return string Markdown text.
     */
    public function convert(string $html): string
    {
        $markdown = '';
        foreach ($this->parseFragment($html) as $node) {
            $markdown .= $this->processNode($node);
        }

        // preg_replace() returns null on a PCRE error; fall back to the
        // uncollapsed text instead of violating the string return type.
        $collapsed = preg_replace('/\n{3,}/', "\n\n", $markdown);

        return trim($collapsed ?? $markdown);
    }

    /**
     * Converts an HTML fragment into an ordered list of parts.
     *
     * @param string $html HTML fragment (may be malformed or empty).
     *
     * @return array<int, array<string, string>> Each entry is either
     *         ['type' => 'text', 'content' => string] or
     *         ['type' => 'image', 'url' => string]. Adjacent text is merged
     *         into a single part.
     */
    public function convertToArray(string $html): array
    {
        $parts = [];
        foreach ($this->parseFragment($html) as $node) {
            $this->processNodeForArray($node, $parts);
        }

        return $parts;
    }

    /**
     * Parses an HTML fragment and returns its top-level nodes in order.
     *
     * The fragment is parsed inside a synthetic wrapper <div>: with
     * LIBXML_HTML_NOIMPLIED libxml needs a single root element, and without
     * the wrapper it nests later top-level siblings inside the first element
     * (e.g. "<p>a</p><p>b</p>" becomes "<p>a<p>b</p></p>").
     *
     * The previous libxml error-handling mode is restored afterwards so the
     * caller's global state is not changed.
     *
     * @param string $html HTML fragment.
     *
     * @return DOMNode[] Top-level nodes of the fragment.
     */
    private function parseFragment(string $html): array
    {
        $dom = new DOMDocument();

        // Collect parser warnings internally instead of emitting PHP warnings
        // for malformed HTML.
        $previousMode = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes libxml read the input as UTF-8.
            $dom->loadHTML(
                '<?xml encoding="utf-8" ?><div>' . $html . '</div>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        $wrapper = $dom->documentElement;
        $nodes = [];
        foreach ($dom->childNodes as $topLevel) {
            if ($wrapper !== null && $topLevel->isSameNode($wrapper)) {
                // Unwrap the synthetic <div>: its children are the fragment.
                foreach ($topLevel->childNodes as $child) {
                    $nodes[] = $child;
                }
            } else {
                // Anything the parser placed next to the wrapper is kept too;
                // non-text, non-element nodes are ignored by the processors.
                $nodes[] = $topLevel;
            }
        }

        return $nodes;
    }

    /**
     * Renders one DOM node (and its subtree) as Markdown.
     *
     * Text nodes yield their text; element nodes are mapped by tag name;
     * every other node type yields an empty string.
     */
    private function processNode(DOMNode $node): string
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            return $this->decodeHtmlEntities((string) $node->nodeValue);
        }

        if (!$node instanceof DOMElement) {
            return '';
        }

        switch (strtolower($node->nodeName)) {
            case 'b':
            case 'strong':
                return $this->wrapInline($this->processChildren($node), '**');

            case 'i':
            case 'em':
                return $this->wrapInline($this->processChildren($node), '*');

            case 'u':
                return $this->wrapInline($this->processChildren($node), '__');

            case 'a':
                return $this->convertLink($node);

            case 'p':
                return $this->processChildren($node) . "\n\n";

            case 'br':
                return "\n";

            case 'ul':
            case 'ol':
                return $this->convertList($node);

            case 'img':
                $url = $this->resolveImageUrl($node);

                // Emit only the URL, on its own line, so Discord can render it.
                return $url === null ? '' : "\n" . $url . "\n";

            default:
                // div, li outside a list, and unknown tags: keep the content only.
                return $this->processChildren($node);
        }
    }

    /**
     * Concatenates the Markdown of all children of $node.
     */
    private function processChildren(DOMNode $node): string
    {
        $rendered = '';
        foreach ($node->childNodes as $child) {
            $rendered .= $this->processNode($child);
        }

        return $rendered;
    }

    /**
     * Surrounds $inner with an inline Markdown marker.
     *
     * Blank content is returned unchanged so empty tags do not produce stray
     * markers such as "****".
     */
    private function wrapInline(string $inner, string $marker): string
    {
        if (trim($inner) === '') {
            return $inner;
        }

        return $marker . $inner . $marker;
    }

    /**
     * Renders an <a> element.
     *
     * - If its only non-whitespace child is an <img>, the image output is
     *   returned as-is (no link syntax around the image URL).
     * - If href is empty, only the link text is returned.
     * - If the text is empty, the bare href is returned.
     * - Otherwise a masked link "[text](href)" is returned.
     */
    private function convertLink(DOMElement $anchor): string
    {
        // Ignore whitespace-only text nodes when deciding whether the link
        // merely wraps an image.
        $meaningfulChildren = [];
        foreach ($anchor->childNodes as $child) {
            $isBlankText = $child->nodeType === XML_TEXT_NODE
                && trim((string) $child->nodeValue) === '';
            if (!$isBlankText) {
                $meaningfulChildren[] = $child;
            }
        }

        $content = $this->processChildren($anchor);

        if (count($meaningfulChildren) === 1
            && strtolower($meaningfulChildren[0]->nodeName) === 'img'
        ) {
            return $content;
        }

        $href = trim($anchor->getAttribute('href'));
        if ($href === '') {
            return $content;
        }
        if (trim($content) === '') {
            return $href;
        }

        return "[{$content}]({$href})";
    }

    /**
     * Renders a <ul> or <ol> element as a Markdown list.
     *
     * Each <li> child becomes one entry; entries with no content are skipped.
     * Non-<li> children with visible content are rendered as entries as well
     * so nothing is dropped. The output starts with a newline so the list
     * always begins on its own line (this also makes nested lists work) and
     * ends with a blank line.
     */
    private function convertList(DOMElement $list): string
    {
        $isOrdered = strtolower($list->nodeName) === 'ol';
        $entries = [];
        $number = 1;

        foreach ($list->childNodes as $child) {
            $isItem = strtolower($child->nodeName) === 'li';
            $content = trim(
                $isItem ? $this->processChildren($child) : $this->processNode($child)
            );

            // Strict comparison: an item whose text is "0" must be kept.
            if ($content === '') {
                continue;
            }

            $marker = $isOrdered ? ($number++) . '. ' : '- ';
            $entries[] = $this->formatListItem($content, $marker);
        }

        return "\n" . implode("\n", $entries) . "\n\n";
    }

    /**
     * Formats one list entry: the marker followed by the first line, with any
     * further non-blank lines indented to align under the entry text.
     *
     * @param string $content Trimmed, non-empty entry content.
     * @param string $marker  List marker including its trailing space.
     */
    private function formatListItem(string $content, string $marker): string
    {
        $lines = [];
        foreach (explode("\n", $content) as $line) {
            // Keep leading whitespace: it carries nested-list indentation.
            $line = rtrim($line);
            if ($line !== '') {
                $lines[] = $line;
            }
        }

        $indent = str_repeat(' ', strlen($marker));

        return $marker . implode("\n" . $indent, $lines);
    }

    /**
     * Returns the URL to use for an <img> element, or null if it has no src.
     *
     * A src that has a URL scheme ("https:", "data:", ...) or is
     * protocol-relative ("//host/...") is returned unchanged. Any other src
     * is treated as relative and appended to BOT_BASE_URL. If that constant
     * is not defined, the src is returned unchanged.
     */
    private function resolveImageUrl(DOMElement $image): ?string
    {
        $src = trim($image->getAttribute('src'));
        if ($src === '') {
            return null;
        }

        if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $src) === 1) {
            return $src;
        }

        if (!defined('BOT_BASE_URL')) {
            return $src;
        }

        $base = rtrim((string) constant('BOT_BASE_URL'), '/');
        $path = ltrim($src, '/');

        return "{$base}/{$path}";
    }

    /**
     * Decodes HTML entities (HTML5 set, both quote styles) in a UTF-8 string.
     *
     * NOTE(review): DOM text nodes are already entity-decoded by the parser,
     * so text passed in from them is decoded a second time here (source text
     * "&amp;lt;" ends up as "<"). Kept for backward compatibility — confirm
     * whether the input is expected to be double-encoded.
     */
    private function decodeHtmlEntities(string $encodedString): string
    {
        return html_entity_decode($encodedString, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Appends the parts produced by one DOM node (and its subtree) to $parts.
     *
     * - Text nodes add text.
     * - <img> adds an image part (skipped when src is empty).
     * - <br> adds a newline, matching convert().
     * - <p> and <div> add their content followed by a blank line.
     * - Any other element contributes only its children's parts.
     *
     * @param DOMNode $node  Node to process.
     * @param array   $parts Accumulator, modified in place.
     */
    private function processNodeForArray(DOMNode $node, array &$parts): void
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            $this->addTextPart($parts, $this->decodeHtmlEntities((string) $node->nodeValue));

            return;
        }

        if (!$node instanceof DOMElement) {
            return;
        }

        $tag = strtolower($node->nodeName);

        if ($tag === 'img') {
            $url = $this->resolveImageUrl($node);
            if ($url !== null) {
                $parts[] = ['type' => 'image', 'url' => $url];
            }

            return;
        }

        if ($tag === 'br') {
            $this->addTextPart($parts, "\n");

            return;
        }

        foreach ($node->childNodes as $child) {
            $this->processNodeForArray($child, $parts);
        }

        if ($tag === 'p' || $tag === 'div') {
            $this->addTextPart($parts, "\n\n");
        }
    }

    /**
     * Adds text to $parts, merging it into the last part when that part is
     * also text. Empty strings are ignored.
     *
     * @param array  $parts Accumulator (append-only list), modified in place.
     * @param string $text  Text to add.
     */
    private function addTextPart(array &$parts, string $text): void
    {
        // Compare against '' rather than using empty(): the text "0" is valid.
        if ($text === '') {
            return;
        }

        $lastIndex = count($parts) - 1;
        if ($lastIndex >= 0 && $parts[$lastIndex]['type'] === 'text') {
            $parts[$lastIndex]['content'] .= $text;

            return;
        }

        $parts[] = ['type' => 'text', 'content' => $text];
    }
}