// coop/crates/common/src/media_extractor.rs

use gpui::SharedUri;
use regex::Regex;

/// Finds media URLs inside free-form text.
///
/// The extractor owns one pre-compiled pattern per media kind so the patterns
/// are built once and reused across calls. It can list the URLs it finds,
/// return the text with those URLs stripped out, or do both at once.
#[derive(Debug, Clone)]
pub struct MediaExtractor {
    /// Pattern for `http`/`https` URLs ending in an image extension.
    /// Used for extraction, removal, validation and categorization.
    image_regex: Regex,
    /// Pattern for `http`/`https` URLs ending in a video extension.
    /// Only consulted by `is_media_url` and `categorize_urls`.
    video_regex: Regex,
}

impl MediaExtractor {
    /// Creates a new MediaExtractor with compiled regex patterns
    pub fn new() -> Self {
        MediaExtractor {
            // Match common image extensions
            image_regex: Regex::new(
                r#"(?i)\bhttps?://[^\s<>"']+\.(?:jpg|jpeg|png|gif|bmp|webp|svg|ico)(?:\?[^\s<>"']*)?\b"#,
            ).unwrap(),
            // Match common video extensions
            video_regex: Regex::new(
                r#"(?i)\bhttps?://[^\s<>"']+\.(?:mp4|mov|avi|mkv|webm|flv|wmv|m4v|3gp)(?:\?[^\s<>"']*)?\b"#,
            ).unwrap(),
        }
    }

    /// Extracts all media URLs from a string
    pub fn extract_media_urls(&self, text: &str) -> Vec<SharedUri> {
        let mut urls = Vec::new();

        // Extract image URLs
        for capture in self.image_regex.find_iter(text) {
            urls.push(capture.as_str().to_string().into());
        }

        // Extract video URLs
        // for capture in self.video_regex.find_iter(text) {
        //     urls.push(capture.as_str().to_string().into());
        // }

        urls
    }

    /// Removes all media URLs from a string and returns the cleaned text
    pub fn remove_media_urls(&self, text: &str) -> String {
        let mut result = text.to_string();

        // Remove image URLs
        result = self.image_regex.replace_all(&result, "").to_string();

        // Remove video URLs
        // result = self.video_regex.replace_all(&result, "").to_string();

        // Clean up extra whitespace that might result from removal
        self.cleanup_text(&result)
    }

    /// Extracts media URLs and removes them from the string, returning both
    pub fn extract_and_remove(&self, text: &str) -> (Vec<SharedUri>, String) {
        let urls = self.extract_media_urls(text);
        let cleaned_text = self.remove_media_urls(text);
        (urls, cleaned_text)
    }

    /// Helper function to clean up text after URL removal
    fn cleanup_text(&self, text: &str) -> String {
        let text = text.trim();

        // Remove multiple consecutive spaces
        let re = Regex::new(r"\s+").unwrap();
        re.replace_all(text, " ").trim().to_string()
    }

    /// Validates if a URL is a valid media URL
    pub fn is_media_url(&self, url: &str) -> bool {
        self.image_regex.is_match(url) || self.video_regex.is_match(url)
    }

    /// Categorizes extracted URLs into images and videos
    pub fn categorize_urls(&self, urls: &[SharedUri]) -> (Vec<SharedUri>, Vec<SharedUri>) {
        let mut images = Vec::new();
        let mut videos = Vec::new();

        for url in urls {
            if self.image_regex.is_match(url) {
                images.push(url.clone());
            } else if self.video_regex.is_match(url) {
                videos.push(url.clone());
            }
        }

        (images, videos)
    }
}

impl Default for MediaExtractor {
    fn default() -> Self {
        Self::new()
    }
}

/// Convenience function for one-time extraction and removal
pub fn extract_and_remove_media_urls(text: &str) -> (Vec<SharedUri>, String) {
    let extractor = MediaExtractor::new();
    extractor.extract_and_remove(text)
}

/// Convenience function for just extracting media URLs
pub fn extract_media_urls(text: &str) -> Vec<SharedUri> {
    let extractor = MediaExtractor::new();
    extractor.extract_media_urls(text)
}

/// Convenience function for just removing media URLs
pub fn remove_media_urls(text: &str) -> String {
    let extractor = MediaExtractor::new();
    extractor.remove_media_urls(text)
}