add media extractor

2026-04-09 15:44:22 +07:00
parent 9ff18aae35
commit 72e3686f45
5 changed files with 189 additions and 48 deletions
--- a/crates/chat/src/message.rs
+++ b/crates/chat/src/message.rs
@@ -1,8 +1,8 @@
 use std::hash::Hash;
 use std::ops::Range;

-use common::{EventExt, NostrParser};
-use gpui::SharedString;
+use common::{EventExt, NostrParser, extract_and_remove_media_urls};
+use gpui::{SharedString, SharedUri};
 use nostr_sdk::prelude::*;

 /// New message.
@@ -132,6 +132,8 @@ pub struct RenderedMessage {
    pub author: PublicKey,
    /// The content/text of the message
    pub content: String,
+    /// List of media URLs in the message
+    pub media: Vec<SharedUri>,
    /// Message created time as unix timestamp
    pub created_at: Timestamp,
    /// List of mentioned public keys in the message
@@ -144,11 +146,13 @@ impl From<&Event> for RenderedMessage {
    fn from(val: &Event) -> Self {
        let mentions = extract_mentions(&val.content);
        let replies_to = extract_reply_ids(&val.tags);
+        let (media, string) = extract_and_remove_media_urls(&val.content);

        Self {
            id: val.id,
            author: val.pubkey,
-            content: val.content.clone(),
+            content: string,
+            media,
            created_at: val.created_at,
            mentions,
            replies_to,
@@ -160,12 +164,14 @@ impl From<&UnsignedEvent> for RenderedMessage {
    fn from(val: &UnsignedEvent) -> Self {
        let mentions = extract_mentions(&val.content);
        let replies_to = extract_reply_ids(&val.tags);
+        let (media, string) = extract_and_remove_media_urls(&val.content);

        Self {
            // Event ID must be known
            id: val.id.unwrap(),
            author: val.pubkey,
-            content: val.content.clone(),
+            content: string,
+            media,
            created_at: val.created_at,
            mentions,
            replies_to,
@@ -177,12 +183,14 @@ impl From<&NewMessage> for RenderedMessage {
    fn from(val: &NewMessage) -> Self {
        let mentions = extract_mentions(&val.rumor.content);
        let replies_to = extract_reply_ids(&val.rumor.tags);
+        let (media, string) = extract_and_remove_media_urls(&val.rumor.content);

        Self {
            // Event ID must be known
            id: val.rumor.id.unwrap(),
            author: val.rumor.pubkey,
-            content: val.rumor.content.clone(),
+            content: string,
+            media,
            created_at: val.rumor.created_at,
            mentions,
            replies_to,
--- a/crates/common/Cargo.toml
+++ b/crates/common/Cargo.toml
@@ -20,3 +20,4 @@ log.workspace = true
 dirs = "5.0"
 qrcode = "0.14.1"
 bech32 = "0.11.1"
+regex = "1.10"
--- a/crates/common/src/lib.rs
+++ b/crates/common/src/lib.rs
@@ -1,6 +1,7 @@
 pub use debounced_delay::*;
 pub use display::*;
 pub use event::*;
+pub use media_extractor::*;
 pub use parser::*;
 pub use paths::*;
 pub use range::*;
@@ -8,6 +9,7 @@ pub use range::*;
 mod debounced_delay;
 mod display;
 mod event;
+mod media_extractor;
 mod parser;
 mod paths;
 mod range;
--- a/crates/common/src/media_extractor.rs
+++ b/crates/common/src/media_extractor.rs
@@ -0,0 +1,117 @@
+use gpui::SharedUri;
+use regex::Regex;
+
+/// Finds image and video URLs in text, recognised by file extension on http(s) links.
+#[derive(Debug, Clone)]
+pub struct MediaExtractor {
+    image_regex: Regex,
+    video_regex: Regex,
+}
+
+impl MediaExtractor {
+    /// Creates a new `MediaExtractor`.
+    ///
+    /// Each pattern is compiled once per process and every call clones the cached
+    /// regexes, so building an extractor per message never recompiles them.
+    pub fn new() -> Self {
+        static IMAGE: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();
+        static VIDEO: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();
+        // The patterns are fixed literals, so a compile failure is a programming error.
+        let compile = |pattern: &str| Regex::new(pattern).expect("media URL pattern is valid");
+        // Match common image extensions
+        let image = IMAGE.get_or_init(|| {
+            compile(r#"(?i)\bhttps?://[^\s<>"']+\.(?:jpg|jpeg|png|gif|bmp|webp|svg|ico)(?:\?[^\s<>"']*)?\b"#)
+        });
+        // Match common video extensions
+        let video = VIDEO.get_or_init(|| {
+            compile(r#"(?i)\bhttps?://[^\s<>"']+\.(?:mp4|mov|avi|mkv|webm|flv|wmv|m4v|3gp)(?:\?[^\s<>"']*)?\b"#)
+        });
+
+        MediaExtractor {
+            image_regex: image.clone(),
+            video_regex: video.clone(),
+        }
+    }
+
+    /// Extracts all media URLs from a string.
+    ///
+    /// Image URLs are listed first, then video URLs; within each group the URLs
+    /// keep the order in which they appear in `text`.
+    pub fn extract_media_urls(&self, text: &str) -> Vec<SharedUri> {
+        self.image_regex
+            .find_iter(text)
+            .chain(self.video_regex.find_iter(text))
+            .map(|found| SharedUri::from(found.as_str().to_string()))
+            .collect()
+    }
+
+    /// Removes all media URLs from a string and returns the cleaned text.
+    pub fn remove_media_urls(&self, text: &str) -> String {
+        let without_images = self.image_regex.replace_all(text, "");
+        let without_media = self.video_regex.replace_all(&without_images, "");
+        // Clean up extra whitespace that might result from removal
+        self.cleanup_text(&without_media)
+    }
+
+    /// Extracts media URLs and removes them from the string, returning both.
+    pub fn extract_and_remove(&self, text: &str) -> (Vec<SharedUri>, String) {
+        (self.extract_media_urls(text), self.remove_media_urls(text))
+    }
+
+    /// Tidies the whitespace left behind once URLs have been removed.
+    ///
+    /// Leading and trailing whitespace is dropped and every remaining whitespace
+    /// run shrinks to one character: a newline if the run held a line break (so
+    /// multi-line messages keep their lines), otherwise a space.
+    fn cleanup_text(&self, text: &str) -> String {
+        let mut cleaned = String::with_capacity(text.len());
+        let mut gap: Option<char> = None;
+        for ch in text.trim().chars() {
+            if !ch.is_whitespace() {
+                // Emit the separator for the whitespace run that just ended.
+                cleaned.extend(gap.take());
+                cleaned.push(ch);
+            } else if ch == '\n' || gap.is_none() {
+                gap = Some(if ch == '\n' { '\n' } else { ' ' });
+            }
+        }
+        cleaned
+    }
+
+    /// Returns `true` if `url` contains an image or video URL (the match is unanchored).
+    pub fn is_media_url(&self, url: &str) -> bool {
+        self.image_regex.is_match(url) || self.video_regex.is_match(url)
+    }
+
+    /// Categorizes URLs into `(images, videos)`; URLs matching neither are dropped.
+    pub fn categorize_urls(&self, urls: &[SharedUri]) -> (Vec<SharedUri>, Vec<SharedUri>) {
+        urls.iter()
+            .filter(|url| self.is_media_url(url))
+            .cloned()
+            .partition(|url| self.image_regex.is_match(url))
+    }
+}
+
+impl Default for MediaExtractor {
+    fn default() -> MediaExtractor {
+        MediaExtractor::new() // the default extractor is exactly what `new` builds
+    }
+}
+
+/// Extracts media URLs from `text` and strips them out in one call, using a
+/// freshly built [`MediaExtractor`]; returns `(urls, cleaned_text)`.
+pub fn extract_and_remove_media_urls(text: &str) -> (Vec<SharedUri>, String) {
+    MediaExtractor::new().extract_and_remove(text)
+}
+
+/// Returns the media URLs found in `text`, using a freshly built
+/// [`MediaExtractor`]; the text itself is left untouched.
+pub fn extract_media_urls(text: &str) -> Vec<SharedUri> {
+    MediaExtractor::new().extract_media_urls(text)
+}
+
+/// Returns `text` with its media URLs removed, using a freshly built
+/// [`MediaExtractor`]; the URLs themselves are discarded.
+pub fn remove_media_urls(text: &str) -> String {
+    MediaExtractor::new().remove_media_urls(text)
+}